# Logistic Regression

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import preprocessing

Bring in the data and remove the null values from the data set 

In [5]:
df = pd.read_csv('data.csv', names = ["#A","B","#C","D","#E","F","G","H","I","J","#K","#L","#M","N","O"], header = None)

In [7]:
df = df[~df.eq("?").any(1)]

In [8]:
df.head()

Unnamed: 0,#A,B,#C,D,#E,F,G,H,I,J,#K,#L,#M,N,O
0,39,A0,77516,B0,13,C0,D0,E0,F0,G0,2174,0,40,H0,SMALL
1,50,A1,83311,B0,13,C1,D1,E1,F0,G0,0,0,13,H0,SMALL
2,38,A2,215646,B1,9,C2,D2,E0,F0,G0,0,0,40,H0,SMALL
3,53,A2,234721,B2,7,C1,D2,E1,F1,G0,0,0,40,H0,SMALL
4,28,A2,338409,B0,13,C1,D3,E2,F1,G1,0,0,40,H1,SMALL


# Normalize the Data 

In [15]:
df_num = df.iloc[:,[0,2,4,10,11,12]].values.astype(float)
df_cat = df.iloc[:,[1,3,5,6,7,8,9,13,14]]

In [17]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(df_num)
df_numeric = pd.DataFrame(x_scaled)

In [23]:
df_numeric.columns = ['#A','#C','#E','#K','#L','#M']
df_numeric.head()

Unnamed: 0,#A,#C,#E,#K,#L,#M
0,0.30137,0.043338,0.8,0.02174,0.0,0.397959
1,0.452055,0.047277,0.8,0.0,0.0,0.122449
2,0.287671,0.137244,0.533333,0.0,0.0,0.397959
3,0.493151,0.150212,0.4,0.0,0.0,0.397959
4,0.150685,0.220703,0.8,0.0,0.0,0.397959


In [30]:
cols = ['B','D','F','G','H','I','J','N']
df_categ = pd.get_dummies(df_cat, columns = cols, drop_first = True)

In [31]:
df_categ.head()

Unnamed: 0,O,B_A1,B_A2,B_A3,B_A4,B_A6,B_A7,D_B1,D_B10,D_B11,...,N_H37,N_H38,N_H39,N_H40,N_H41,N_H5,N_H6,N_H7,N_H8,N_H9
0,SMALL,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,SMALL,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,SMALL,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,SMALL,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SMALL,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
df_total = df_numeric.merge(df_categ, left_index= True, right_index = True)

In [122]:
df_total.head()

Unnamed: 0,#A,#C,#E,#K,#L,#M,O,B_A1,B_A2,B_A3,...,N_H37,N_H38,N_H39,N_H40,N_H41,N_H5,N_H6,N_H7,N_H8,N_H9
0,0.30137,0.043338,0.8,0.02174,0.0,0.397959,SMALL,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.452055,0.047277,0.8,0.0,0.0,0.122449,SMALL,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.287671,0.137244,0.533333,0.0,0.0,0.397959,SMALL,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0.493151,0.150212,0.4,0.0,0.0,0.397959,SMALL,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.150685,0.220703,0.8,0.0,0.0,0.397959,SMALL,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Create a Logistic Model 

In [128]:
X = df_total.drop('O', axis = 1)
Y = df_total['O']

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, 
                                                  random_state=101)

In [124]:
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [125]:
predictions = logmodel.predict(X_test)

In [126]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

       LARGE       0.71      0.54      0.61      2168
       SMALL       0.85      0.92      0.89      6219

    accuracy                           0.82      8387
   macro avg       0.78      0.73      0.75      8387
weighted avg       0.82      0.82      0.82      8387



# See if a model with just numeric data is better

In [106]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(df_numeric, df['O'], test_size=0.30, 
                                                  random_state=101)

In [107]:
logmodel_numeric = LogisticRegression(solver='liblinear')
logmodel_numeric.fit(X_train1,y_train1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [108]:
prediction_numeric = logmodel_numeric.predict(X_test1)

In [109]:
print(classification_report(y_test1,prediction_numeric))

              precision    recall  f1-score   support

       LARGE       0.72      0.37      0.49      2306
       SMALL       0.82      0.95      0.88      6743

    accuracy                           0.80      9049
   macro avg       0.77      0.66      0.68      9049
weighted avg       0.79      0.80      0.78      9049



Categorical and normalized numeric data is the best fit for the model

# Create a prediction with the futures data set and first model

In [94]:
futures = pd.read_csv('futures.csv',names = ["#A","B","#C","D","#E","F","G","H","I","J","#K","#L","#M","N"],header=None)

In [95]:
futures = futures[~futures.eq("?").any(1)]
futures.head()

  result = method(y)


Unnamed: 0,#A,B,#C,D,#E,F,G,H,I,J,#K,#L,#M,N
0,38,A0,89814,B1,9,C1,D1,E1,F1,G0,0,0,50,H0
1,28,A1,336951,B2,12,C1,D2,E1,F1,G0,0,0,40,H0
2,44,A0,160323,B3,10,C1,D0,E1,F0,G0,7688,0,40,H0
4,63,A3,104626,B5,15,C1,D5,E1,F1,G0,3103,0,32,H0
5,65,A0,184454,B1,9,C1,D0,E1,F1,G0,6418,0,40,H0


In [96]:
future_num = futures.iloc[:,[0,2,4,10,11,12]].values.astype(float)
future_cat = futures.iloc[:,[1,3,5,6,7,8,9,13]]

In [97]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(future_num)
future_numeric = pd.DataFrame(x_scaled)
future_numeric.columns = ['#A','#C','#E','#K','#L','#M']
future_numeric.head()

Unnamed: 0,#A,#C,#E,#K,#L,#M
0,0.287671,0.051677,0.533333,0.0,0.0,0.5
1,0.150685,0.219011,0.733333,0.0,0.0,0.397959
2,0.369863,0.099418,0.6,0.076881,0.0,0.397959
3,0.630137,0.061706,0.933333,0.03103,0.0,0.316327
4,0.657534,0.115757,0.533333,0.064181,0.0,0.397959


In [98]:
future_categ = pd.get_dummies(future_cat, columns = cols, drop_first = True)
future_categ.head()

Unnamed: 0,B_A1,B_A3,B_A4,B_A5,B_A6,B_A7,D_B1,D_B10,D_B11,D_B12,...,N_H37,N_H38,N_H39,N_H4,N_H40,N_H5,N_H6,N_H7,N_H8,N_H9
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
future_total = future_numeric.merge(future_categ, left_index= True, right_index = True)
future_total.head()

Unnamed: 0,#A,#C,#E,#K,#L,#M,B_A1,B_A3,B_A4,B_A5,...,N_H37,N_H38,N_H39,N_H4,N_H40,N_H5,N_H6,N_H7,N_H8,N_H9
0,0.287671,0.051677,0.533333,0.0,0.0,0.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.150685,0.219011,0.733333,0.0,0.0,0.397959,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.369863,0.099418,0.6,0.076881,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.657534,0.115757,0.533333,0.064181,0.0,0.397959,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.424658,0.180263,0.533333,0.03103,0.0,0.479592,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [127]:
future_predict = logmodel.predict(future_total)

ValueError: X has 95 features per sample; expecting 96