정원혁 William, 2020, D+
# Overall Outline

1. Read Data

2. Independent variable / Dependent variable: Ranking / ......
    - Remove unnecessary columns
    - (Create additional necessary columns)
3. Handle NA values
4. Handle outliers
5. Dummy encoding
    - Convert categorical / numerical variables using astype()
    - For numerical variables: scale transformation
        - min / max scale
        - mean / std scale
6. Split into train / test sets
7. Declare model
8. Train model
9. Predict
10. Performance
    - MSE, R-square
    - Accuracy



# 1. Read Data

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read Data
# The file is parsed as semicolon-delimited text, hence the explicit separator.
data = pd.read_csv(
    'https://raw.githubusercontent.com/Peter-Mfitumukiza/ai-class/main/student-mat.csv',
    sep=';',
)

# Bare trailing expression so the notebook renders the loaded frame.
data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


# 2. Independent variable / Dependent variable: Ranking / ......
    - Remove unnecessary columns
    - (Create additional necessary columns)

In [8]:
# G3 is the prediction target; every other column is kept as a feature.
y = data['G3']
X = data.drop(columns=['G3'])

# 3. Handle NA values

In [11]:
# Identify numeric and categorical columns.
# 'number' matches every numeric dtype (not only int64/float64), and the
# categorical list is its complement, so the two lists together cover every
# column of X. ColumnTransformer's default is remainder='drop', so a column
# that landed in neither list would be discarded without any warning.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Create preprocessing pipelines
# Numeric columns: fill gaps with the column median, then min/max scale.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Non-numeric columns: fill gaps with the literal 'missing' category, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# 5. Dummy encoding / Scaling
    - Convert categorical / numerical variables using astype()
    - For numerical variables: scale transformation
        - min / max scale
        - mean / std scale

In [None]:
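# Standalone scaling is disabled: MinMaxScaler already runs inside the numeric
# preprocessing pipeline defined above. The draft below is kept for reference
# only; applied to the full X it would also try to scale the non-numeric
# columns and would fit the scaler before the train/test split.
# scaler = MinMaxScaler()
# X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)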

# 6. Split into train / test sets


In [12]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 7. Declare models


In [13]:
# Regression models, keyed by the display name used in the result tables.
reg_models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
])

# Classification models, keyed the same way.
class_models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
])

# 8. Train and evaluate regression models

In [23]:
# Function to evaluate regression models
def evaluate_regression(models, X_train, X_test, y_train, y_test):
    """Fit every regressor behind the preprocessing step and score it on the test split.

    The module-level ``preprocessor`` object is passed to every pipeline.

    Args:
        models: Mapping of display name -> estimator.
        X_train, y_train: Data each pipeline is fitted on.
        X_test, y_test: Data used for prediction and scoring.

    Returns:
        A ``(results, predictions)`` tuple. ``results`` maps each model name to
        ``{'MSE': ..., 'R2': ...}``; ``predictions`` maps each model name to the
        values predicted for ``X_test``. Both are empty when ``models`` is empty.
    """
    scores, preds_by_model = {}, {}
    for label, estimator in models.items():
        # fit() returns the pipeline itself, so construction and fitting chain.
        fitted = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', estimator),
        ]).fit(X_train, y_train)
        predicted = fitted.predict(X_test)
        preds_by_model[label] = predicted
        scores[label] = {
            'MSE': mean_squared_error(y_test, predicted),
            'R2': r2_score(y_test, predicted),
        }
    return scores, preds_by_model

# Evaluate regression models on the held-out split.
reg_results, reg_predictions = evaluate_regression(
    models=reg_models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# 8. (continued) Train and evaluate classification models

In [22]:
# Function to evaluate classification models
def evaluate_classification(models, X_train, X_test, y_train, y_test):
    """Fit every classifier behind the preprocessing step and score it on the test split.

    The module-level ``preprocessor`` object is passed to every pipeline.

    Args:
        models: Mapping of display name -> estimator.
        X_train, y_train: Data each pipeline is fitted on.
        X_test, y_test: Data used for prediction and scoring.

    Returns:
        A ``(results, predictions)`` tuple. ``results`` maps each model name to
        ``{'Accuracy': ..., 'Classification Report': ...}`` where the report is
        the dict form produced by ``classification_report(output_dict=True)``;
        ``predictions`` maps each model name to the labels predicted for
        ``X_test``. Both are empty when ``models`` is empty.
    """
    scores, preds_by_model = {}, {}
    for label, estimator in models.items():
        # fit() returns the pipeline itself, so construction and fitting chain.
        fitted = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', estimator),
        ]).fit(X_train, y_train)
        predicted = fitted.predict(X_test)
        preds_by_model[label] = predicted
        scores[label] = {
            'Accuracy': accuracy_score(y_test, predicted),
            'Classification Report': classification_report(y_test, predicted, output_dict=True),
        }
    return scores, preds_by_model

# Evaluate classification models
# Convert G3 to binary (pass/fail) for classification.
# The threshold is named once so the full, train and test labels are always
# derived from the same cut-off.
PASS_MARK = 10  # Assuming 10 is the pass mark
y_binary = (y >= PASS_MARK).astype(int)
y_train_binary = (y_train >= PASS_MARK).astype(int)
y_test_binary = (y_test >= PASS_MARK).astype(int)
class_results, class_predictions = evaluate_classification(class_models, X_train, X_test, y_train_binary, y_test_binary)

# 9. Predict


In [26]:
# Create DataFrame for regression predictions:
# the true targets first, then one column per model in dict order.
reg_pred_df = pd.DataFrame({'Actual': y_test}).assign(**reg_predictions)

# Create DataFrame for classification predictions, laid out the same way.
class_pred_df = pd.DataFrame({'Actual': y_test_binary}).assign(**class_predictions)

print("\nRegression Predictions:")
print(reg_pred_df)

print("\nClassification Predictions:")
print(class_pred_df)


Regression Predictions:
     Actual  Linear Regression  Decision Tree  Random Forest
78       10           6.015625            8.0           8.24
371      12          11.515625           12.0          11.69
248       5           2.867188            8.0           6.47
55       10           8.820312           10.0           9.82
390       9           8.562500            9.0           8.75
..      ...                ...            ...            ...
364      12          10.171875           11.0          10.78
82        6           6.265625            5.0           5.89
114       9           7.664062            8.0           8.90
3        15          12.398438           13.0          14.18
18        5           4.351562            5.0           5.75

[79 rows x 4 columns]

Classification Predictions:
     Actual  Logistic Regression  Decision Tree  Random Forest
78        1                    0              0              0
371       1                    1              1              1
24

# 10. Performance


In [27]:
# Print results
# Regression: one line per model with MSE and R2 formatted to four decimals.
print("Regression Results:")
for name, metrics in reg_results.items():
    print(f"{name}: MSE = {metrics['MSE']:.4f}, R2 = {metrics['R2']:.4f}")

# Classification: accuracy first, then the report dict rendered as a table.
# The frame is transposed so each class/average is a row and each metric
# (precision, recall, f1-score, support) is a column.
print("\nClassification Results:")
for name, metrics in class_results.items():
    print(f"{name}: Accuracy = {metrics['Accuracy']:.4f}")
    print(f"Classification Report:\n{pd.DataFrame(metrics['Classification Report']).transpose()}\n")

Regression Results:
Linear Regression: MSE = 5.6638, R2 = 0.7238
Decision Tree: MSE = 6.6076, R2 = 0.6778
Random Forest: MSE = 3.9931, R2 = 0.8053

Classification Results:
Logistic Regression: Accuracy = 0.8734
Classification Report:
              precision    recall  f1-score    support
0              0.840000  0.777778  0.807692  27.000000
1              0.888889  0.923077  0.905660  52.000000
accuracy       0.873418  0.873418  0.873418   0.873418
macro avg      0.864444  0.850427  0.856676  79.000000
weighted avg   0.872180  0.873418  0.872178  79.000000

Decision Tree: Accuracy = 0.8481
Classification Report:
              precision    recall  f1-score    support
0              0.777778  0.777778  0.777778  27.000000
1              0.884615  0.884615  0.884615  52.000000
accuracy       0.848101  0.848101  0.848101   0.848101
macro avg      0.831197  0.831197  0.831197  79.000000
weighted avg   0.848101  0.848101  0.848101  79.000000

Random Forest: Accuracy = 0.9114
Classification 