In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, log_loss

from google.colab import files
uploaded = files.upload()

# Load dataset
df = pd.read_csv("research.csv")

# Clean RD_Value and Relative_Sampling_Error
df['RD_Value'] = pd.to_numeric(df['RD_Value'], errors='coerce')
df['Relative_Sampling_Error'] = pd.to_numeric(df['Relative_Sampling_Error'], errors='coerce')
df = df.dropna(subset=['RD_Value'])

# Fill missing Relative_Sampling_Error
df['Relative_Sampling_Error'] = df['Relative_Sampling_Error'].fillna(df['Relative_Sampling_Error'].median())

# Drop unused columns
df.drop(columns=['Status', 'Footnotes', 'Variable', 'Breakdown', 'Unit'], inplace=True, errors='ignore')

# Encode categorical data
df['Breakdown_category'] = LabelEncoder().fit_transform(df['Breakdown_category'])

# Create classification target: High vs Low RD_Value
df['RD_Class'] = df['RD_Value'].apply(lambda x: 1 if x >= 100 else 0)

# Prepare features and target
X = df[['Year', 'Breakdown_category', 'Relative_Sampling_Error']]
y = df['RD_Class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model
model = LogisticRegression()

# Train-test 70-30 split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nCross-Entropy Loss:", log_loss(y_test, y_prob))

# 10-Fold Cross Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='accuracy')
print("\n10-Fold CV Accuracy Scores:", cv_scores)
print("Average CV Accuracy:", cv_scores.mean())

# Train-Test Split Ratios
for ratio in [0.2, 0.3, 0.4]:
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=ratio, random_state=42)
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(f"\nTrain-Test Split {100 - int(ratio*100)}-{int(ratio*100)}: Accuracy = {acc:.4f}")


Saving research.csv to research.csv
Confusion Matrix:
 [[3830  417]
 [ 919 1300]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.90      0.85      4247
           1       0.76      0.59      0.66      2219

    accuracy                           0.79      6466
   macro avg       0.78      0.74      0.76      6466
weighted avg       0.79      0.79      0.79      6466


Cross-Entropy Loss: 0.5421463514029765

10-Fold CV Accuracy Scores: [0.79174397 0.80232019 0.79164733 0.77540603 0.78143852 0.78143852
 0.77679814 0.79443155 0.79953596 0.79257541]
Average CV Accuracy: 0.7887335617647186

Train-Test Split 80-20: Accuracy = 0.7968

Train-Test Split 70-30: Accuracy = 0.7934

Train-Test Split 60-40: Accuracy = 0.7887
