In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
train_model_classification = pd.read_csv('train_model_classification.csv')

In [4]:
train_model_classification = train_model_classification.drop('Unnamed: 0',axis=1)

In [5]:
print(len(train_model_classification[train_model_classification['CrossedMeanSales']==1]))
print(len(train_model_classification[train_model_classification['CrossedMeanSales']==0]))

588642
255480


*Prints the number of rows with CrossedMeanSales 1 and 0 respectively*

In [6]:
train_model_classification.columns

Index(['Store', 'DayOfWeek', 'Sales', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'MeanSalesStore',
       'CrossedMeanSales'],
      dtype='object')

In [7]:
train_model_classification = train_model_classification.drop('Sales',axis=1)
train_model_classification = train_model_classification.drop('MeanSalesStore',axis=1)

**Logistic Regression**

In [8]:
array = train_model_classification.values
X = train_model_classification.drop('CrossedMeanSales', axis=1)
Y = train_model_classification['CrossedMeanSales']
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,
random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy:",result*100.0)

Accuracy: 99.99246125624191


*accuracy = (correct predictions / total predictions) *100*

*The accuracy is 74.6%. This means 74.6% of data is classified correctly.*

In [9]:
from sklearn.metrics import confusion_matrix
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted) 
print("Confusion Matrix")
print(matrix)

Confusion Matrix
[[ 84111     21]
 [     0 194429]]


*The top left value is the True - Positive i.e, 51386 rows are classified correctly*

*The bottom right value is the True - Negative i.e, 156443 rows are not classified*

*The botton left value is the False - Negative i.e, 37986*

*The top right value is the False - Positive  i.e, 32746*

**Logistic regression with cross validation**

In [10]:
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=5, random_state=7)  
scoring = 'roc_auc' 
#results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
#print("AUC:",results.mean())
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring="accuracy")
print("Accuracy:",results.mean())

Accuracy: 0.9998921425359126


*accuracy = (correct predictions / total predictions)*

*The accuracy is 99.9%. This means 99.9% of data is classified correctly.*

**Logistic Regression with SGD and Data Normalization**

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

logistic = SGDClassifier(loss='log', penalty='l2', early_stopping=True,
                         max_iter=10000, tol=1e-5, random_state=0)


pipeline_order_logistic = [('scaler', StandardScaler()), ('logistic', logistic)]

Model_Pipeline_logistic = Pipeline(pipeline_order_logistic)

# evaluate pipeline
kfold = KFold(n_splits=3, random_state=7)
results = cross_val_score(Model_Pipeline_logistic, X_train, Y_train, cv=kfold,scoring= 'roc_auc')
Model_Pipeline_logistic.fit(X_train, Y_train)
preds = Model_Pipeline_logistic.predict(X_test)
print("AUC:",results.mean())

AUC: 0.8210121702588463


*accuracy = (correct predictions / total predictions)*

*The accuracy is 82.1%. This means 82.1% of data is classified correctly.*

In [12]:
matrix = confusion_matrix(Y_test, preds) 
print("Confusion Matrix")
print(matrix)

Confusion Matrix
[[ 53963  30169]
 [ 40362 154067]]


*The top left value is the True - Positive i.e, 53963 rows are classified correctly*

*The bottom right value is the True - Negative i.e, 154067 rows are not classified*

*The botton left value is the False - Negative i.e, 40362*

*The top right value is the False - Positive  i.e, 30169*

In [14]:
print("Accuracy for logistic regression with SGD",Model_Pipeline_logistic.score(X_test,Y_test)*100)

Accuracy for logistic regression with SGD 74.68023161892728


**Calculate the probability of getting the dependent variable**


*Probability = correct predictions/ total predictions*

*Here, the probability is 0.74*