In [None]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif


In [None]:
# Load the training data into `df`; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('train.csv')

2- Data Exploration and Preprocessing

In [None]:
# Number of missing values in each column of the freshly loaded frame.
df.isna().sum()

Student ID                           0
Age                                 92
Gender                               0
Home Region                          2
Home City                            2
Program ID                           0
Program Main Category Code           0
Program Sub Category Code          935
Technology Type                   2982
Program Skill Level               1646
Program Presentation Method          0
Program Start Date                   0
Program End Date                     0
Program Days                         0
Completed Degree                     0
Level of Education                  26
Education Speaciality              277
College                           3890
University Degree Score             81
University Degree Score System      81
Employment Status                  566
Job Type                          4567
Still Working                     4567
Y                                    0
dtype: int64

In [None]:
# Remove columns that will not be used as features. Per the null counts shown
# above, 'College', 'Job Type' and 'Still Working' are missing in more than half
# of the rows.
# NOTE(review): 'Home City' and 'Home Region' have only 2 missing values each;
# the reason for dropping them is not visible here — confirm.
df.drop(
    columns=['College', 'Job Type', 'Still Working', 'Home City', 'Home Region'],
    inplace=True,
)


In [None]:
# Display the frame's (rows, columns) to confirm the column drop took effect.
df.shape

(6548, 19)

In [None]:
# Missing-value counts for the columns that remain after the drop.
df.isna().sum()

Student ID                           0
Age                                 92
Gender                               0
Program ID                           0
Program Main Category Code           0
Program Sub Category Code          935
Technology Type                   2982
Program Skill Level               1646
Program Presentation Method          0
Program Start Date                   0
Program End Date                     0
Program Days                         0
Completed Degree                     0
Level of Education                  26
Education Speaciality              277
University Degree Score             81
University Degree Score System      81
Employment Status                  566
Y                                    0
dtype: int64

In [None]:
# Impute missing numeric values and assign the result back to the frame.
#
# `df[col].fillna(..., inplace=True)` acts on a temporary Series selected from
# the frame: recent pandas 2.x releases emit a FutureWarning for this chained
# in-place call, and with Copy-on-Write enabled `df` is left unchanged. Explicit
# assignment works the same way on every pandas version.
#
# NOTE(review): the means are computed over the whole dataset, before the
# train/test split done later in the notebook — confirm whether imputation
# should be fitted on the training rows only.
for mean_filled_col in ['Age', 'University Degree Score']:
    df[mean_filled_col] = df[mean_filled_col].fillna(df[mean_filled_col].mean())

# NOTE(review): 5 is presumably the most common scoring scale — confirm
# against the data before relying on it.
df['University Degree Score System'] = df['University Degree Score System'].fillna(5)


In [None]:
# Fill missing categorical values with each column's most frequent value.
#
# The result is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary Series, triggers a FutureWarning in recent pandas 2.x releases and
# does not modify `df` when Copy-on-Write is enabled.
mode_filled_cols = [
    'Technology Type',
    'Education Speaciality',
    'Program Skill Level',
    'Level of Education',
    'Employment Status',
]
for mode_filled_col in mode_filled_cols:
    # mode() returns a Series (ties are possible); take its first entry.
    df[mode_filled_col] = df[mode_filled_col].fillna(df[mode_filled_col].mode()[0])

# A missing sub-category falls back to the row's own main category code
# (fillna with a Series aligns on the index, so the fill is row-wise).
df['Program Sub Category Code'] = df['Program Sub Category Code'].fillna(
    df['Program Main Category Code']
)


In [None]:
# Derive calendar features from the program start/end dates, then drop the raw
# date columns.
date_columns = ['Program Start Date', 'Program End Date']

# Parse each date column exactly once (the earlier version converted both
# columns twice, which was redundant work).
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col])

start_dates = df['Program Start Date']
end_dates = df['Program End Date']

# Day / month / year components. The assignment order is kept stable because
# it determines the column order of the resulting frame.
df['Start Day'] = start_dates.dt.day
df['Start Month'] = start_dates.dt.month
df['Start Year'] = start_dates.dt.year
df['End Day'] = end_dates.dt.day
df['End Month'] = end_dates.dt.month
df['End Year'] = end_dates.dt.year

# Elapsed whole days between start and end.
# NOTE(review): this may duplicate the existing 'Program Days' column — confirm.
df['Program Duration'] = (end_dates - start_dates).dt.days
df['Start Quarter'] = start_dates.dt.quarter
df['End Quarter'] = end_dates.dt.quarter

# The raw datetime columns are no longer needed once the features exist.
df.drop(columns=date_columns, inplace=True)

In [None]:
# Missing-value counts after imputation and date feature engineering.
df.isna().sum()

Student ID                        0
Age                               0
Gender                            0
Program ID                        0
Program Main Category Code        0
Program Sub Category Code         0
Technology Type                   0
Program Skill Level               0
Program Presentation Method       0
Program Days                      0
Completed Degree                  0
Level of Education                0
Education Speaciality             0
University Degree Score           0
University Degree Score System    0
Employment Status                 0
Y                                 0
Start Day                         0
Start Month                       0
Start Year                        0
End Day                           0
End Month                         0
End Year                          0
Program Duration                  0
Start Quarter                     0
End Quarter                       0
dtype: int64

3- Model Building

In [None]:
# Baseline model: one-hot encode categoricals, scale numerics, fit a logistic
# regression and report hold-out metrics.

# Features are every column except the target 'Y' and the 'Student ID' identifier.
X = df.drop(['Y', 'Student ID'], axis=1)
y = df['Y']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups are chosen by dtype: object columns are treated as categorical,
# numeric columns as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ])

# Fit the preprocessing on the training rows only, then apply it to the test rows.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# The default iteration cap (max_iter=100) is too low for this feature matrix:
# the solver stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT". Raising the
# cap lets the optimisation run to convergence.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_preprocessed, y_train)

# Predict on the held-out rows.
y_pred_logistic = logistic_model.predict(X_test_preprocessed)

# Hold-out evaluation metrics.
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
precision_logistic = precision_score(y_test, y_pred_logistic)
recall_logistic = recall_score(y_test, y_pred_logistic)
f1_logistic = f1_score(y_test, y_pred_logistic)

# Print the results
print("Logistic Regression:")
print("Accuracy:", accuracy_logistic)
print("Precision:", precision_logistic)
print("Recall:", recall_logistic)
print("F1 Score:", f1_logistic)

Logistic Regression:
Accuracy: 0.9015267175572519
Precision: 0.6820809248554913
Recall: 0.6145833333333334
F1 Score: 0.6465753424657533


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


4- Model Improvement

In [None]:
# Compare several classifiers with 5-fold cross-validation on the training
# split and report hold-out F1 / recall for each.
# (f1_score and recall_score are already imported in the notebook's first cell.)

# Candidate estimators.
# - LogisticRegression gets a higher max_iter because the default cap produced
#   "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings in every fit.
# - The tree ensembles are seeded so their scores are identical on every run.
algorithms = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Hyperparameter grids keyed by algorithm name. Keys use the 'model__' prefix
# because the estimator is the 'model' step of the pipeline. Algorithms without
# an entry are cross-validated with an empty grid, i.e. their constructor
# settings above.
param_grids = {
    'Random Forest': {
        'model__n_estimators': [100],
        'model__max_depth': [None],
        'model__min_samples_split': [2]
    },
    'SVM': {
        'model__C': [1],
        'model__kernel': ['linear']
    },
    'Gradient Boosting': {
        'model__n_estimators': [100],
        'model__learning_rate': [0.1],
        'model__max_depth': [3]
    }
}

best_model_name = ''
best_model_score = 0.0
# Fitted pipeline of the best-scoring algorithm; `best_model` alone would only
# hold whichever algorithm happened to run last.
best_overall_model = None

# Iterate over each algorithm and evaluate its performance
for algorithm_name, algorithm in algorithms.items():
    # Preprocessing lives inside the pipeline so each CV fold fits the encoder
    # and scaler on that fold's training portion only.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', algorithm)
    ])

    param_grid = param_grids.get(algorithm_name, {})

    # Perform grid search with cross-validation; refit=True retrains the best
    # configuration on the full training split.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', refit=True)
    grid_search.fit(X_train, y_train)

    # Get the best model and its mean cross-validated accuracy
    best_model = grid_search.best_estimator_
    cv_score = grid_search.best_score_

    # Get the predictions for the test set
    y_pred = best_model.predict(X_test)

    # Calculate the F1 score and recall on the held-out rows
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Model selection is driven by the CV score, not by the test metrics.
    if cv_score > best_model_score:
        best_model_name = algorithm_name
        best_model_score = cv_score
        best_overall_model = best_model

    # Print the results for the current algorithm
    print(algorithm_name)
    print("Best Hyperparameters:", grid_search.best_params_)
    print("CV Score:", cv_score)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("------------------------------------")

# Report which algorithm won the comparison.
print("Best model by CV accuracy:", best_model_name, "-", best_model_score)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression
Best Hyperparameters: {}
CV Score: 0.8835427648607072
F1 Score: 0.6465753424657533
Recall: 0.6145833333333334
------------------------------------
Random Forest
Best Hyperparameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 100}
CV Score: 0.8804891474733335
F1 Score: 0.6306818181818181
Recall: 0.578125
------------------------------------
SVM
Best Hyperparameters: {'model__C': 1, 'model__kernel': 'linear'}
CV Score: 0.8865983872496482
F1 Score: 0.6361323155216285
Recall: 0.6510416666666666
------------------------------------
Gradient Boosting
Best Hyperparameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}
CV Score: 0.8862165255874654
F1 Score: 0.6528497409326426
Recall: 0.65625
------------------------------------
