<center>Logistic Regression w/ Organics.csv dataset & statsmodels library</center>

In [1]:
# import libraries 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
# create dataframe from the Organics CSV
# (relative path: the file is looked up in the notebook's working directory)
dfRL = pd.read_csv("Organics.csv")

In [3]:
# examine the dataframe's characteristics (column dtypes and non-null counts)
dfRL.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         22223 non-null  int64  
 1   DemAffl    21138 non-null  float64
 2   DemAge     20715 non-null  float64
 3   DemGender  19711 non-null  object 
 4   PromClass  22223 non-null  object 
 5   PromSpend  22223 non-null  float64
 6   PromTime   21942 non-null  float64
 7   TargetBuy  22223 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 1.4+ MB


In [4]:
# number of missing values in each column
dfRL.isna().sum()

ID              0
DemAffl      1085
DemAge       1508
DemGender    2512
PromClass       0
PromSpend       0
PromTime      281
TargetBuy       0
dtype: int64

In [5]:
# keep only complete rows: discard any row that has at least one missing value
dfRL_nonull = dfRL.dropna(axis=0, how="any")
dfRL_nonull.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17272 entries, 0 to 22221
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         17272 non-null  int64  
 1   DemAffl    17272 non-null  float64
 2   DemAge     17272 non-null  float64
 3   DemGender  17272 non-null  object 
 4   PromClass  17272 non-null  object 
 5   PromSpend  17272 non-null  float64
 6   PromTime   17272 non-null  float64
 7   TargetBuy  17272 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 1.2+ MB


In [6]:
# drop ID; it doesn't help with the prediction model in this case
# NOTE(review): PromSpend is dropped here as well, although the original note only
# mentioned ID -- confirm that leaving it out of the model is intentional
# errors="ignore" keeps this cell re-runnable: dfRL_nonull is reassigned to itself,
# so a second run would otherwise raise KeyError on the already-removed columns
dfRL_nonull = dfRL_nonull.drop(columns=["ID", "PromSpend"], errors="ignore")
dfRL_nonull.head()

Unnamed: 0,DemAffl,DemAge,DemGender,PromClass,PromTime,TargetBuy
0,10.0,76.0,U,Gold,4.0,0
1,4.0,49.0,U,Gold,5.0,0
2,5.0,70.0,F,Silver,8.0,1
3,10.0,65.0,M,Tin,7.0,1
4,11.0,68.0,F,Tin,8.0,0


In [7]:
# create dummy (0/1) columns for the categorical variables
# the object-dtype columns (DemGender, PromClass) are expanded into one integer
# column per category; the numeric columns are carried over unchanged
dfRL_nonull = pd.get_dummies(dfRL_nonull, dtype = int)

# preview the first rows only instead of rendering the whole dataframe
dfRL_nonull.head()

Unnamed: 0,DemAffl,DemAge,PromTime,TargetBuy,DemGender_F,DemGender_M,DemGender_U,PromClass_Gold,PromClass_Platinum,PromClass_Silver,PromClass_Tin
0,10.0,76.0,4.0,0,0,0,1,1,0,0,0
1,4.0,49.0,5.0,0,0,0,1,1,0,0,0
2,5.0,70.0,8.0,1,1,0,0,0,0,1,0
3,10.0,65.0,7.0,1,0,1,0,0,0,0,1
4,11.0,68.0,8.0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
22216,13.0,49.0,9.0,0,0,1,0,0,0,1,0
22218,13.0,65.0,5.0,0,1,0,0,0,0,1,0
22219,15.0,73.0,12.0,0,0,0,1,1,0,0,0
22220,9.0,70.0,5.0,0,1,0,0,1,0,0,0


In [8]:
# define the dependent variable (DV) and independent variables (IVs) for the logistic regression
# y: numpy array holding the TargetBuy values
y = np.array(dfRL_nonull.loc[:, "TargetBuy"])

# x: dataframe holding the predictors
# DemGender_U and PromClass_Gold are not selected, so each set of dummy columns
# has one category left out
x = dfRL_nonull[
    [
        "DemAffl",
        "DemAge",
        "PromTime",
        "DemGender_F",
        "DemGender_M",
        "PromClass_Platinum",
        "PromClass_Silver",
        "PromClass_Tin",
    ]
]


In [9]:
# hold out 30% of the rows for testing and train on the remaining 70%
# the split is random, so random_state is fixed to make it repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=44,
)

In [10]:
# prepare the inputs for the logistic regression model
# add the "const" column to both independent-variable dataframes
x_train, x_test = (sm.add_constant(frame) for frame in (x_train, x_test))

In [11]:
# fit the logistic regression on the training data, then print the results summary
model = sm.Logit(endog=y_train, exog=x_train)
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.449569
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                12090
Model:                          Logit   Df Residuals:                    12081
Method:                           MLE   Df Model:                            8
Date:                Wed, 31 Jan 2024   Pseudo R-squ.:                  0.2230
Time:                        15:46:59   Log-Likelihood:                -5435.3
converged:                       True   LL-Null:                       -6994.9
Covariance Type:            nonrobust   LLR p-value:                     0.000
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -2.0906      0.190    -11.030      0.000      -2.462      -1.719
DemAf

In [12]:
# add predicted probabilities for the test data (used for the confusion matrix)
# predict only from the columns the model was fitted on: once Prediction_Prob has
# been added to x_test, passing the whole frame again would hand predict an extra
# column and make a re-run of this cell fail
x_test["Prediction_Prob"] = result.predict(x_test[result.model.exog_names])
x_test.head()

Unnamed: 0,const,DemAffl,DemAge,PromTime,DemGender_F,DemGender_M,PromClass_Platinum,PromClass_Silver,PromClass_Tin,Prediction_Prob
10588,1.0,6.0,34.0,6.0,1,0,0,0,0,0.379534
15281,1.0,6.0,59.0,4.0,0,1,0,1,0,0.054972
8921,1.0,11.0,55.0,1.0,1,0,0,0,0,0.398116
20175,1.0,10.0,76.0,8.0,1,0,0,0,0,0.146168
14592,1.0,10.0,40.0,9.0,0,0,0,0,1,0.14465


In [13]:
# convert the probabilities to class predictions: 1 if probability >= 0.5, 0 otherwise
predictions = x_test["Prediction_Prob"].ge(0.5).astype(int)
predictions

10588    0
15281    0
8921     0
20175    0
14592    0
        ..
20050    0
21925    0
10631    0
16430    0
9579     0
Name: Prediction_Prob, Length: 5182, dtype: int32

In [14]:
# compute the confusion matrix on the test set
# (sklearn's metrics module is imported with the other libraries in the first cell)
# rows are the actual classes, columns the predicted classes: [[TN, FP], [FN, TP]]
conf_matrix = metrics.confusion_matrix(y_test, predictions)

conf_matrix

array([[3604,  213],
       [ 797,  568]], dtype=int64)

TN = 3604
TP = 568
FN = 797
FP = 213