In [1]:
,# This piece of code enables display of multiple output from one cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [7]:
#importing libraries
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics

We examined the purchase behavior of new autos by examining a file named AutoPurchaseData. Twenty households were questioned about their income levels and the age of their oldest car. We then followed up six months later to see if they had purchased a new vehicle in that meantime. We recorded it as "0" if they hadn't, and as "1," if they had. This enabled us to identify any trends in the relationship between individuals' income and the age of their vehicles and their decision to purchase a new vehicle."

In [10]:
Auto_Purchase = pd.read_csv("/content/AutoPurchaseData.csv")

Auto_Purchase.shape

(20, 3)

The file has 20 rows and 3 columns

In [11]:
Auto_Purchase.head()

Unnamed: 0,Income,Age,Purchased
0,45000,2,0
1,40000,4,0
2,60000,3,1
3,50000,2,1
4,55000,2,0


To analyze the data better, we are converting the income information in the Income column from words to numbers.

In [12]:
Auto_Purchase['Income'] = Auto_Purchase['Income'].str.replace(',', '').astype(int)
Auto_Purchase.head()

Unnamed: 0,Income,Age,Purchased
0,45000,2,0
1,40000,4,0
2,60000,3,1
3,50000,2,1
4,55000,2,0


We are splitting the data into two groups: one for testing and one for training. Then, we are using logistic regression to analyze it and see how well it works in real situations.

In [14]:
#We are making a duplicate of the dataset.
Auto_Purchase_train = Auto_Purchase.copy()


Breaking down the X and Y variables and splitting the dataset using a specific starting point, which is called the seed, set at 11.

In [15]:
Auto_Purchase_train, Auto_Purchase_val = train_test_split(Auto_Purchase, test_size=0.20, random_state=11)


y_train = Auto_Purchase_train.Purchased.values
y_val = Auto_Purchase_val.Purchased.values

del Auto_Purchase_train['Purchased']
del Auto_Purchase_val['Purchased']


In [16]:
Auto_Purchase_train.head().T

Unnamed: 0,14,19,15,9,18
Income,43000,27000,49000,48000,34000
Age,9,6,2,1,5


In [18]:
prototype = LogisticRegression(solver = 'lbfgs')
prototype.fit(X= Auto_Purchase_train,y=y_train)

In [20]:
print(prototype.intercept_)

print(prototype.coef_)


# Precision of the model
prototype.score(X = Auto_Purchase_train , y = y_train)

[0.00389333]
[[-2.69051348e-05  2.43200559e-01]]


0.625

The model's accuracy stands at 62.5%, indicating a subpar performance. The coefficient β1, reflecting the impact of income on the likelihood of purchasing a car, is approximately -2.69e-05. This suggests that, in terms of the logarithm of odds, the probability of buying a car decreases by roughly -2.69e-05 for every $1 increase in income.

In [32]:
preds = prototype.predict(X= Auto_Purchase_train)
metrics.confusion_matrix(y_true=y_train, y_pred=preds)
print(metrics.classification_report(y_true=y_train, y_pred=preds) )

array([[8, 2],
       [4, 2]])

              precision    recall  f1-score   support

           0       0.67      0.80      0.73        10
           1       0.50      0.33      0.40         6

    accuracy                           0.62        16
   macro avg       0.58      0.57      0.56        16
weighted avg       0.60      0.62      0.60        16



In [35]:
preds = prototype.predict(X= Auto_Purchase_val)
metrics.confusion_matrix(y_true=y_val, y_pred=preds)
print(metrics.classification_report(y_true=y_val, y_pred=preds) )

array([[0, 0],
       [2, 2]])

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.50      0.67         4

    accuracy                           0.50         4
   macro avg       0.50      0.25      0.33         4
weighted avg       1.00      0.50      0.67         4



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Based on the training data, the model performs reasonably well, demonstrating good precision and recall for both categories. On fresh, untested data (validation set), however, its accuracy significantly decreases. Our conclusions might be more trustworthy and accurate if the dataset had more data points.

In [39]:
Income = 45000
Age = 5

# Need to anticipate the probability
probability = prototype.predict_proba([[Income, Age]])[:, 1][0]
print("Estimated probability:", probability)

Estimated probability: 0.5022912490812197




It is roughly 50.23% likely to be accurate to predict that a family with a $45,000 salary and a 5-year-old car will purchase a new car during the next six months.

In [40]:
# It is necessary to establish an interaction word.
Auto_Purchase_train['Income_Age_interaction'] = Auto_Purchase_train['Income'] * Auto_Purchase_train['Age']

# Fitting a logistic regression model with the interaction term is necessary.
X_interaction = Auto_Purchase_train[['Income', 'Age', 'Income_Age_interaction']]
prototype_interaction = LogisticRegression(solver = 'lbfgs')
prototype_interaction.fit(X_interaction, y_train)

In [41]:
print(prototype_interaction.intercept_)

print(prototype_interaction.coef_)


# prototype accuracy
prototype_interaction.score(X = X_interaction , y = y_train)

[-1.28904216e-09]
[[-4.61872873e-05 -1.54787042e-09  1.22293012e-05]]


0.6875

The presence of a non-zero coefficient for the interaction term and the increased accuracy of the model suggest that the interaction term could be beneficial to include.