In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
import pickle

In [4]:
# Raw CSV headers to keep; they are normalised to snake_case just below.
columns = [
    'Make', 'Model', 'Year', 'Engine HP',
    'Engine Cylinders', 'Transmission Type', 'Vehicle Style',
    'highway MPG', 'city mpg', 'MSRP',
]

df = pd.read_csv("data.csv")[columns].copy()
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same lower-case / underscore normalisation to every object-dtype column.
is_text = df.dtypes == 'object'
categorical_columns = list(df.dtypes[is_text].index)
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Missing values become 0; the binary target then flags rows priced above the mean MSRP.
df = df.fillna(0)
mean_price = df["msrp"].mean()
df["above_average"] = np.where(df["msrp"] > mean_price, 1, 0)

In [15]:
# Hold out 20% of the rows (test_size=0.2) as the final test set; the remaining
# rows form the "full train" split. random_state=1 pins the split.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [6]:
# Numeric feature columns fed to the model.
numerical = [
    'year',
    'engine_hp',
    'engine_cylinders',
    'highway_mpg',
    'city_mpg',
]

# String-valued feature columns fed to the model.
categorical = [
    'make',
    'model',
    'transmission_type',
    'vehicle_style',
]

In [9]:
def train(df_train, y_train, C=1.0, max_iter=1000):
    """Fit a DictVectorizer and a LogisticRegression on the training rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain every column listed in the module-level
        ``categorical`` and ``numerical`` lists.
    y_train : array-like
        Target labels aligned with the rows of ``df_train``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.
    max_iter : int, default 1000
        Forwarded to ``LogisticRegression(max_iter=...)``. Exposed so callers
        can raise it when the solver reports that the iteration limit was
        reached; the default keeps the previous behaviour.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted DictVectorizer and the fitted classifier.
    """
    # One dict per row, restricted to the feature columns.
    records = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(records)

    model = LogisticRegression(C=C, max_iter=max_iter)
    model.fit(X_train, y_train)

    return dv, model

In [8]:
def predict(df, dv, model):
    """Score rows with an already-fitted vectorizer/classifier pair.

    Selects the ``categorical`` and ``numerical`` feature columns from ``df``,
    encodes them with ``dv.transform`` and returns column 1 of
    ``model.predict_proba`` (one value per input row).
    """
    feature_frame = df[categorical + numerical]
    records = feature_frame.to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [7]:
# Tunables: C is passed to train(); n_splits is the number of cross-validation folds.
C, n_splits = 1.0, 5

In [16]:
# K-fold cross-validation on the full-train split: fit on each training fold,
# score the held-out fold, and collect one ROC AUC per fold in `scores`.
Kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in Kfold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.above_average.values
    y_val = df_val.above_average.values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    # Named fold_auc so the `auc` function imported from sklearn.metrics
    # is not overwritten by a float.
    fold_auc = roc_auc_score(y_val, y_pred)
    scores.append(fold_auc)

# Mean and standard deviation of the per-fold AUCs.
print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

C=1.0 0.980 +- 0.003


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Per-fold ROC AUC values collected by the cross-validation loop.
scores

[0.976558589775321,
 0.9779365681488323,
 0.9827435725837935,
 0.9775723607160783,
 0.9832875457875458]

In [19]:
# Refit on the whole full-train split and score once on the held-out test set.
# C is taken from the configuration cell so the fitted model matches the C
# value that is embedded in the saved model's filename.
dv, model = train(df_full_train, df_full_train.above_average.values, C=C)
y_pred = predict(df_test, dv, model)

y_test = df_test.above_average.values
# Stored as test_auc so the `auc` function imported from sklearn.metrics stays usable.
test_auc = roc_auc_score(y_test, y_pred)
test_auc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9792908289461444

### Save the model

In [21]:
# The filename embeds the current value of C; the bare expression displays it.
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [22]:
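# Manual open/close equivalent of the `with` block in the next cell
# (kept for reference only; not executed):
# f_out = open(output_file, 'wb')
# pickle.dump((dv, model), f_out)
# f_out.close()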

In [23]:
# Persist the fitted vectorizer and classifier together as one pickled tuple.
with open(output_file, mode='wb') as model_file:
    model_file.write(pickle.dumps((dv, model)))

### Load the model

In [1]:
import pickle

In [2]:
# Path of the pickled (dv, model) pair; hard-coded to the name the save step produced for C=1.0.
input_file = 'model_C=1.0.bin'

In [4]:
# Restore the (vectorizer, classifier) tuple written by the save step.
# NOTE: unpickling executes code embedded in the file, so only load files you trust.
with open(input_file, mode='rb') as model_file:
    dv, model = pickle.load(model_file)

In [5]:
# Display the unpickled objects to confirm what was loaded.
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [38]:
# Example record to score. It must be bound to `car` because the following
# cell calls dv.transform([car]); a bare dict literal would only be displayed
# and would leave `car` undefined on a fresh kernel.
car = {
    'make': 'aston_martin',
    'model': 'v8_vantage',
    'transmission_type': 'automated_manual',
    'vehicle_style': 'convertible',
    'year': 2014,
    'engine_hp': 420.0,
    'engine_cylinders': 8.0,
    'highway_mpg': 21,
    'city_mpg': 14,
}
car

{'make': 'aston_martin',
 'model': 'v8_vantage',
 'transmission_type': 'automated_manual',
 'vehicle_style': 'convertible',
 'year': 2014,
 'engine_hp': 420.0,
 'engine_cylinders': 8.0,
 'highway_mpg': 21,
 'city_mpg': 14}

In [39]:
# `car` is expected to be a single feature dict; it is wrapped in a list
# because dv.transform is given a list of record dicts (as in predict()).
X = dv.transform([car])

In [40]:
# Row 0 is the only record in X; column 1 is the same predict_proba column
# that predict() returns as the score.
model.predict_proba(X)[0, 1]

0.9999999999999729