In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn import set_config
from sklearn.compose import ColumnTransformer

In [2]:
mobileData = pd.read_csv("./train - train.csv")
X, y = mobileData.drop(['price_range'], axis=1) , mobileData['price_range']
set_config(transform_output="pandas")

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [4]:
print(y_train.value_counts()/len(y_train))

price_range
2    0.25
3    0.25
1    0.25
0    0.25
Name: count, dtype: float64


In [5]:
print(y_test.value_counts()/len(y_test))

price_range
2    0.25
1    0.25
0    0.25
3    0.25
Name: count, dtype: float64


In [6]:
pipeline = Pipeline([('scaler', StandardScaler()), ('imputer', SimpleImputer(strategy='median'))])

In [7]:
pipeline.fit(X_train)

X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

In [8]:
xgb = XGBClassifier(random_state=0)
rForest = RandomForestClassifier(random_state=0)
lg = LogisticRegression(random_state=0)

xgb.fit(X_train_transformed, y_train)
rForest.fit(X_train_transformed, y_train)
lg.fit(X_train_transformed, y_train)

In [9]:
xgb_pred_train = xgb.predict(X_train_transformed)
rForest_pred_train = rForest.predict(X_train_transformed)
lg_pred_train = lg.predict(X_train_transformed)

In [10]:
print(f"XGB:\t{accuracy_score(y_train, xgb_pred_train)}")
print(f"Random Forest:\t{accuracy_score(y_train, rForest_pred_train)}")
print(f"Logistic Regression:\t{accuracy_score(y_train, lg_pred_train)}")

XGB:	1.0
Random Forest:	1.0
Logistic Regression:	0.97625


In [11]:
xgb_pred_test = xgb.predict(X_test_transformed)
rForest_pred_test = rForest.predict(X_test_transformed)
lg_pred_test = lg.predict(X_test_transformed)

In [12]:
print(f"XGB:\t{accuracy_score(y_test, xgb_pred_test)}")
print(f"Random Forest:\t{accuracy_score(y_test, rForest_pred_test)}")
print(f"Logistic Regression:\t{accuracy_score(y_test, lg_pred_test)}")

XGB:	0.905
Random Forest:	0.8725
Logistic Regression:	0.9675


In [41]:
params = {'penalty':('l1', 'l2', None), 'tol':[1, 0.1,1e-3,1e-4], 'max_iter': [25, 50, 100]}
clf = GridSearchCV(LogisticRegression(random_state =0), params)
clf.fit(X_train_transformed, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [42]:
pred = clf.predict(X_train_transformed)
predT = clf.predict(X_test_transformed)

print(accuracy_score(y_train, pred), accuracy_score(y_test,predT))

0.990625 0.975


In [43]:
clf.best_params_

{'max_iter': 25, 'penalty': None, 'tol': 1}

In [46]:
lg_pre_final = LogisticRegression(max_iter = 100, penalty = None, tol = 1, random_state =0)
sbs = SFS(lg_pre_final,
         k_features=12,
         forward=False,
         floating=False,
         cv=0)
sbs.fit(X_train_transformed, y_train)
features = sbs.k_feature_names_

In [47]:
lg_final = LogisticRegression(max_iter = 100, penalty = None, tol = 1, random_state =0)
lg_final.fit(X_train_transformed[list(features)], y_train)

pred = lg_final.predict(X_train_transformed[list(features)])
predT = lg_final.predict(X_test_transformed[list(features)])

print(accuracy_score(y_train, pred), accuracy_score(y_test,predT))


0.990625 0.975


In [53]:
final_pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('imputer', SimpleImputer(strategy='median'))
])
final_pipeline = ColumnTransformer([
    ("final", final_pipeline, list(features))
])
final_pipeline.fit(X_train)

In [56]:
X_train_cp = final_pipeline.transform(X_train)
X_test_cp = final_pipeline.transform(X_test)
lg_final = LogisticRegression(max_iter = 100, penalty = None, tol = 1, random_state =0).fit(X_train_cp,y_train)

In [60]:
pred = lg_final.predict(X_train_cp)
predT = lg_final.predict(X_test_cp)

print(f"TRAIN\n{classification_report(y_train, pred)}", f"TEST\n{classification_report(y_test,predT)}")

TRAIN
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       0.99      0.98      0.99       400
           2       0.98      0.99      0.99       400
           3       0.99      0.99      0.99       400

    accuracy                           0.99      1600
   macro avg       0.99      0.99      0.99      1600
weighted avg       0.99      0.99      0.99      1600
 TEST
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       100
           1       0.97      0.99      0.98       100
           2       0.99      0.92      0.95       100
           3       0.94      1.00      0.97       100

    accuracy                           0.97       400
   macro avg       0.98      0.97      0.97       400
weighted avg       0.98      0.97      0.97       400



In [61]:
import pickle
with open("model.pkl", "wb") as f:
    pickle.dump(lg_final, f)

with open("pipeline.pkl", "wb") as f:
    pickle.dump(final_pipeline, f)
