In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the labelled training data; 'train.csv' is resolved relative to the
# current working directory.
data = pd.read_csv('train.csv')

In [3]:
# Preview the first rows to check the available columns.
data.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0


In [4]:
# Work on a copy so the raw frame in `data` stays untouched.
df = data.copy()

In [5]:
# Summary statistics for every column.
df.describe()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4999.5,45.8009,1.9748,1702.44,1.7243,0.3706,0.1215,0.1172,0.0591,0.0277,0.018,64.4352,0.1109
std,2886.89568,8.030274,1.766883,523.789062,0.798845,0.48299,0.326724,0.321675,0.235824,0.16412,0.132958,13.595024,0.314024
min,0.0,23.0,0.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0
25%,2499.75,40.0,0.0,1300.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0
50%,4999.5,46.0,2.0,1550.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,0.0
75%,7499.25,51.0,3.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0
max,9999.0,68.0,9.0,3950.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,1.0


In [6]:
# Remove the row identifier so it does not end up among the model features.
# The frame is derived from the raw `data` rather than from `df` itself, which
# keeps this cell idempotent: running it twice no longer raises KeyError
# because 'Id' has already been dropped.
df = data.drop(columns=['Id'])

In [7]:
df.head()

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0


In [8]:
# Subject indicator columns. `data_prepare` keeps them out of the polynomial
# expansion and the scaling and appends them unchanged.
categorical_features = [
    'physics',
    'chemistry',
    'biology',
    'english',
    'geography',
    'history',
]

In [9]:
y = df['choose']
X = df.drop('choose', axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
y_train = np.array(y_train)
y_train = y_train.reshape((-1,1))
y_test = np.array(y_test)
y_test = y_test.reshape((-1,1))

In [12]:
from Models.Models import PolynomialFeatures, StandardScaler, LogisticRegression, GridSearch, Node, Leaf, RandomForestClassifier

In [13]:
def data_prepare(X_train, X_test, categorical_features, degree):
    """Build polynomial, scaled design matrices for a train/test pair.

    The columns named in ``categorical_features`` are set aside, the remaining
    columns are expanded with ``PolynomialFeatures(degree=degree)`` and passed
    through a ``StandardScaler`` that is fitted on the training part only, and
    the untouched categorical columns are appended on the right.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; both must contain every column listed in
        ``categorical_features``.
    categorical_features : list of str
        Columns that bypass the polynomial expansion and the scaling.
    degree : int
        Degree forwarded to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)``, each the result of ``np.hstack`` over
        the scaled polynomial block and the categorical block.
    """
    def _split(frame):
        # (non-categorical values, categorical values) as fresh NumPy arrays.
        return (
            np.array(frame.drop(categorical_features, axis=1)),
            np.array(frame[categorical_features]),
        )

    train_numeric, train_flags = _split(X_train)
    test_numeric, test_flags = _split(X_test)

    expander = PolynomialFeatures(degree=degree)
    train_expanded = expander.fit_transform(train_numeric)
    # NOTE(review): the test part also goes through fit_transform rather than a
    # transform-only call; whether the project-local PolynomialFeatures offers
    # one is not visible from here -- confirm before changing.
    test_expanded = expander.fit_transform(test_numeric)

    standardiser = StandardScaler()
    train_scaled = standardiser.fit_transform(train_expanded)
    # Transform only: the scaler is fitted on the training matrix alone.
    test_scaled = standardiser.transform(test_expanded)

    return (
        np.hstack((train_scaled, train_flags)),
        np.hstack((test_scaled, test_flags)),
    )

In [14]:
# Degree-2 polynomial design matrices for the train / hold-out split.
X_train_scaled_with_category_2, X_test_scaled_with_category_2 = data_prepare(X_train, X_test, categorical_features, 2)

In [15]:
# Hyperparameter grid handed to the project-local GridSearch for logistic regression.
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2'], 'alpha': [0.0001, 0.00001],
              'max_iter': [100, 200, 300, 400, 500, 600, 700], 'random_state': [42]}

In [16]:
# The same search object is reused for every polynomial degree tried later.
gs = GridSearch('LogisticRegression', 'roc_auc_score', param_grid)

In [17]:
# NOTE(review): the hold-out split is passed straight into the search, so the
# score shown is presumably also the selection criterion -- confirm against
# GridSearch.search before treating it as an unbiased estimate.
best_roc, best_params = gs.search(X_train_scaled_with_category_2, X_test_scaled_with_category_2, y_train, y_test)
best_roc, best_params

(0.6886940932334031,
 {'C': 1,
  'penalty': 'l2',
  'alpha': 0.0001,
  'max_iter': 100,
  'random_state': 42})

In [18]:
# Degree-3 polynomial design matrices for the train / hold-out split.
X_train_scaled_with_category_3, X_test_scaled_with_category_3 = data_prepare(X_train, X_test, categorical_features, 3)

In [19]:
# Run the logistic-regression grid search `gs` on the degree-3 features.
best_roc_3, best_params_3 = gs.search(X_train_scaled_with_category_3, X_test_scaled_with_category_3, y_train, y_test)
best_roc_3, best_params_3

(0.7503568051274893,
 {'C': 1,
  'penalty': 'l2',
  'alpha': 0.0001,
  'max_iter': 100,
  'random_state': 42})

In [20]:
# Degree-4 polynomial design matrices for the train / hold-out split.
X_train_scaled_with_category_4, X_test_scaled_with_category_4 = data_prepare(X_train, X_test, categorical_features, 4)

In [21]:
# Run the logistic-regression grid search `gs` on the degree-4 features.
best_roc_4, best_params_4 = gs.search(X_train_scaled_with_category_4, X_test_scaled_with_category_4, y_train, y_test)
best_roc_4, best_params_4

(0.765854843793782,
 {'C': 1,
  'penalty': 'l2',
  'alpha': 1e-05,
  'max_iter': 200,
  'random_state': 42})

In [22]:
# Degree-5 polynomial design matrices for the train / hold-out split.
X_train_scaled_with_category_5, X_test_scaled_with_category_5 = data_prepare(X_train, X_test, categorical_features, 5)

In [23]:
# Run the logistic-regression grid search `gs` on the degree-5 features.
best_roc_5, best_params_5 = gs.search(X_train_scaled_with_category_5, X_test_scaled_with_category_5, y_train, y_test)
best_roc_5, best_params_5

(0.7692926502521737,
 {'C': 0.01,
  'penalty': 'l2',
  'alpha': 1e-05,
  'max_iter': 200,
  'random_state': 42})

In [24]:
# Degree-6 polynomial design matrices for the train / hold-out split.
X_train_scaled_with_category_6, X_test_scaled_with_category_6 = data_prepare(X_train, X_test, categorical_features, 6)

In [25]:
# Run the logistic-regression grid search `gs` on the degree-6 features.
best_roc_6, best_params_6 = gs.search(X_train_scaled_with_category_6, X_test_scaled_with_category_6, y_train, y_test)
best_roc_6, best_params_6

(0.7710854318436555,
 {'C': 1,
  'penalty': 'l2',
  'alpha': 1e-05,
  'max_iter': 400,
  'random_state': 42})

In [26]:
# Degree-7 polynomial design matrices for the train / hold-out split.
X_train_scaled_with_category_7, X_test_scaled_with_category_7 = data_prepare(X_train, X_test, categorical_features, 7)

In [27]:
# Run the logistic-regression grid search `gs` on the degree-7 features.
best_roc_7, best_params_7 = gs.search(X_train_scaled_with_category_7, X_test_scaled_with_category_7, y_train, y_test)
best_roc_7, best_params_7

(0.7750825248669113,
 {'C': 1,
  'penalty': 'l2',
  'alpha': 1e-05,
  'max_iter': 500,
  'random_state': 42})

In [28]:
y_train_labels = y_train.flatten()
y_test_labels = y_test.flatten()

In [29]:
X_train_array = np.array(X_train)
X_test_array = np.array(X_test)

In [30]:
param_grid_rf = {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 15, 20],\
                 'max_features': [3, 4, 5, 6], 'min_samples_leaf': [1, 5, 10], 'random_state': [42]}

In [31]:
gs_rf = GridSearch('RandomForestClassifier', 'roc_auc_score', param_grid_rf)

In [32]:
best_roc_rf, best_params_rf = gs_rf.search(X_train_array, X_test_array, y_train_labels, y_test_labels)
best_roc_rf, best_params_rf

(0.6912765524306528,
 {'n_estimators': 50,
  'max_depth': None,
  'max_features': 5,
  'min_samples_leaf': 10,
  'random_state': 42})

In [33]:
# Hard-coded copy of the random-forest grid-search result shown in the
# previous cell's output, so later cells can run without repeating the search.
best_roc_rf = 0.6912765524306528
best_params_rf = {
    'n_estimators': 50,
    'max_depth': None,
    'max_features': 5,
    'min_samples_leaf': 10,
    'random_state': 42,
}

In [44]:
# Load the unlabelled test data that the submission files are built for;
# 'test.csv' is resolved relative to the current working directory.
X_test_final = pd.read_csv('test.csv')

In [45]:
# Preview the first rows to check the available columns.
X_test_final.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,10000,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,10001,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,10002,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,10003,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,10004,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0


In [46]:
# Despite its name this holds the test-set 'Id' column; it becomes the 'Id'
# column of every submission frame built below.
final_predictions = X_test_final['Id']

In [47]:
# Drop the identifier so the test columns match the training features in `X`.
# Self-referential: a second run raises KeyError because 'Id' is already gone.
X_test_final = X_test_final.drop('Id', axis=1)

In [48]:
# Full training set (no hold-out) and test set as plain arrays for the final
# random-forest fit.
y_train_final = np.array(y)
X_test_final_array = np.array(X_test_final)
y_train_final_labels = y_train_final.flatten()
X_train_final_array = np.array(X)

In [49]:
# Final random forest configured from the grid-search winner. `best_params_rf`
# is the only random-forest parameter set this notebook defines; the previously
# referenced `best_params_f1_rf` does not exist on a fresh kernel (NameError).
final_model_rf = RandomForestClassifier(**best_params_rf)

In [50]:
# Fit the final random forest on the full training set.
final_model_rf.fit(X_train_final_array, y_train_final_labels)

In [51]:
predictions_rf = final_model_rf.predict(X_test_final_array)

In [52]:
# Wrap in a Series so it can sit next to the 'Id' Series in one frame.
predictions_rf = pd.Series(predictions_rf)

In [53]:
frame_rf = {'Id': final_predictions, 'choose': predictions_rf}

In [54]:
result_rf = pd.DataFrame(frame_rf)

In [55]:
# Write the random-forest submission (Id, choose) without the index column.
result_rf.to_csv('predictions4.csv', index=False)

In [48]:
# Final logistic regression configured with the winner of the degree-4 search.
final_model = LogisticRegression(**best_params_4)

In [49]:
# Full training target as an (n, 1) column array.
y_train_final = np.array(y)
y_train_final = y_train_final.reshape((-1,1))

In [50]:
# Build the final design matrices from the full training frame and the test
# frame. Nothing else in the notebook defines the two `*_final` arrays, so
# without this step the fit here and the prediction cell after it fail with
# NameError on a fresh kernel.
# NOTE(review): degree 4 is chosen to match `best_params_4`, which configures
# `final_model` -- confirm this is the intended degree.
X_train_scaled_with_category_final, X_test_scaled_with_category_final = data_prepare(
    X, X_test_final, categorical_features, 4
)
final_model.fit(X_train_scaled_with_category_final, y_train_final)

In [51]:
# NOTE(review): the preview at the end of this section shows fractional values,
# so `predict` appears to return scores rather than 0/1 labels -- confirm
# against the LogisticRegression implementation.
predictions = final_model.predict(X_test_scaled_with_category_final)

In [52]:
# Flatten to 1-D before wrapping in a Series.
predictions_reshaped = predictions.flatten()

In [53]:
predictions_series = pd.Series(predictions_reshaped)

In [56]:
frame = {'Id': final_predictions, 'choose': predictions_series}

In [57]:
result = pd.DataFrame(frame)

In [58]:
# Preview the submission frame.
result.head()

Unnamed: 0,Id,choose
0,10000,0.411022
1,10001,0.435864
2,10002,0.402715
3,10003,0.353501
4,10004,0.419069


In [59]:
# Write the logistic-regression submission (Id, choose) without the index column.
result.to_csv('predictions.csv', index=False)

In [60]:
# Second random-forest submission model. It is configured with the
# random-forest search result: `best_params_6` holds logistic-regression
# hyperparameters (C, penalty, alpha, max_iter), which are not part of the
# random-forest grid and do not belong in this constructor.
# NOTE(review): `best_params_rf` is the only random-forest parameter set in the
# notebook -- confirm it is the configuration intended for predictions2.csv.
final_model_2 = RandomForestClassifier(**best_params_rf)

In [61]:
# Rebuild the plain-array inputs; `y_train_final` is an (n, 1) array at this
# point, so it is flattened back to 1-D labels for the forest.
X_test_final_array = np.array(X_test_final)
y_train_final_labels = y_train_final.flatten()
X_train_final_array = np.array(X)

In [62]:
# Fit on the full training set.
final_model_2.fit(X_train_final_array, y_train_final_labels)

In [63]:
predictions_2 = final_model_2.predict(X_test_final_array)

In [64]:
# Wrap in a Series so it can sit next to the 'Id' Series in one frame.
predictions_2 = pd.Series(predictions_2)

In [65]:
frame_2 = {'Id': final_predictions, 'choose': predictions_2}

In [66]:
result_2 = pd.DataFrame(frame_2)

In [67]:
# Write the second random-forest submission without the index column.
result_2.to_csv('predictions2.csv', index=False)

In [68]:
# Degree-3 design matrices built from the full training frame and the test frame.
X_train_scaled_with_category_final_3, X_test_scaled_with_category_final_3 = data_prepare(X, X_test_final, categorical_features, 3)

In [69]:
# Logistic regression configured with the winner of the degree-3 search.
final_model_3 = LogisticRegression(**best_params_3)

In [70]:
final_model_3.fit(X_train_scaled_with_category_final_3, y_train_final)

In [71]:
predictions_3 = final_model_3.predict(X_test_scaled_with_category_final_3)

In [72]:
# Flatten to 1-D before wrapping in a Series.
predictions_3 = pd.Series(predictions_3.flatten())

In [73]:
frame_3 = {'Id': final_predictions, 'choose': predictions_3}

In [74]:
result_3 = pd.DataFrame(frame_3)

In [75]:
# Write the degree-3 logistic-regression submission without the index column.
result_3.to_csv('predictions3.csv', index=False)