In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle

In [3]:
# Read the raw sessions table from disk and display its (rows, columns) shape.
sales_csv = "online_sales.csv"
df = pd.read_csv(sales_csv)
df.shape

(316200, 4)

In [4]:
# Preview the first five rows (5 is the pandas default for head()).
df.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited,converted
0,25,1,1,0
1,23,1,5,0
2,28,1,4,0
3,39,1,5,0
4,30,1,6,0


In [5]:
# Count rows per label value to see how imbalanced the target is.
df["converted"].value_counts()

converted
0    306000
1     10200
Name: count, dtype: int64

In [6]:
# Print the per-column dtype and non-null count plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316200 entries, 0 to 316199
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   age                  316200 non-null  int64
 1   new_user             316200 non-null  int64
 2   total_pages_visited  316200 non-null  int64
 3   converted            316200 non-null  int64
dtypes: int64(4)
memory usage: 9.6 MB


In [7]:
# Feature columns are every column except the "converted" label, in file order.
is_feature = df.columns != "converted"
input_columns = df.columns[is_feature].tolist()
print(input_columns)

['age', 'new_user', 'total_pages_visited']


In [8]:
# Name of the label column that the classifier is trained to predict.
output_column = "converted"
print(output_column)

converted


In [9]:
# Feature matrix as a plain NumPy array; the label stays a pandas Series.
feature_frame = df[input_columns]
X = feature_frame.to_numpy()
y = df[output_column]
print(X.shape, y.shape)

(316200, 3) (316200,)


In [10]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the
# converted / non-converted ratio the same in both splits, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=555,
    stratify=y,
)

In [11]:
# Labels are 0/1, so the sum is the number of converted rows in each split
# (train first, then test).
for split_labels in (y_train, y_test):
    print(np.sum(split_labels))

7140
3060


In [12]:
# class_weight='balanced' reweights the classes inversely to their frequency,
# which matters here because converted rows are a small minority.
logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X_train, y_train)

# Mean accuracy on the held-out split (displayed as the cell output).
logreg.score(X_test, y_test)

0.9369175627240144

In [13]:
# Hard 0/1 class predictions for the held-out rows, used by the report below.
predictions = logreg.predict(X_test)

In [14]:
# Per-class precision / recall / F1; display names follow label order 0, 1.
class_labels = ["NonConverted", "Converted"]
report_text = classification_report(y_test, predictions, target_names=class_labels)
print(report_text)

              precision    recall  f1-score   support

NonConverted       1.00      0.94      0.97     91800
   Converted       0.33      0.92      0.49      3060

    accuracy                           0.94     94860
   macro avg       0.66      0.93      0.73     94860
weighted avg       0.98      0.94      0.95     94860


In [15]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickle.dump raises part-way through.
with open("logreg.pkl", "wb") as pickle_out:
    pickle.dump(logreg, pickle_out)

In [16]:
# Reload the model that was just written, to confirm it round-trips.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so only
# load pickle files that come from a trusted source (here: the file this
# notebook wrote in the previous cell).
# The context manager closes the handle, which the original left open.
with open("logreg.pkl", "rb") as pickle_in:
    model = pickle.load(pickle_in)

In [17]:
# One hand-made visitor; values follow the training feature order:
# age, new_user, total_pages_visited.
sample_visitor = [[45, 0, 5]]
model.predict(sample_visitor)[0]

np.int64(0)

In [18]:
# Score the external test file with the reloaded model.
df_test = pd.read_csv("test_data.csv")

# The model was fitted on a bare NumPy array built from `input_columns`, so it
# matches features purely by position. Select those columns explicitly, in
# training order, and pass an array:
#   * a reordered or extra column (e.g. a saved index) can no longer shift the
#     features silently;
#   * a missing feature column fails with a KeyError that names it;
#   * scikit-learn's "X has feature names, but LogisticRegression was fitted
#     without feature names" warning is avoided.
test_features = df_test[input_columns].to_numpy()
predictions = model.predict(test_features)
print(predictions)

[0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0]


