In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import filterwarnings
filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

## 1. Reading Dataset

In [None]:
# Read the training CSV into a DataFrame, report its (rows, columns) shape,
# and preview the first rows.
train_csv_path = '/kaggle/input/mobile-price-classification/train.csv'
data = pd.read_csv(train_csv_path)
print(data.shape)
data.head()

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

## 2. Exploratory Data Analysis

In [None]:
# Class balance of the target column.
# The column is passed by keyword: from seaborn 0.12 on, the first positional
# argument of countplot is `data` (not `x`), so a positional Series is no longer
# interpreted as the variable to count.
sns.countplot(x='price_range', data=data)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(16, 12))
sns.heatmap(
    data.corr(),
    square=True,
    annot=True,
)

In [None]:
# Correlation of every column with the target, strongest first, shown as a
# one-column table shaded with a cyan gradient.
corr = data.corr()
Num = corr[['price_range']].sort_values(by='price_range', ascending=False)
cm = sns.light_palette('cyan', as_cmap=True)
s = Num.style.background_gradient(cmap=cm)
s

In [None]:
# One bar chart per feature, each grouped by price_range, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 6))

panels = [
    ('battery_power', 'Reds'),
    ('px_height', 'Blues'),
    ('px_width', 'Greens'),
    ('ram', 'Oranges'),
]
for ax, (feature, shade) in zip(axes.flat, panels):
    sns.barplot(x='price_range', y=feature, data=data, palette=shade, ax=ax)

In [None]:
# Line plot of ram against price_range.
sns.relplot(
    data=data,
    x='price_range',
    y='ram',
    kind='line',
)

## 3. Feature Engineering

### (A) Feature Scaling

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors, then hold out 20% of the
# rows for testing (fixed seed so the split is repeatable).
target = data['price_range']
features = data.drop('price_range', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Keep the predictor column labels; they are reused when the scaled arrays are
# wrapped back into DataFrames.
cols = features.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictors. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()

# Wrap the scaled arrays back into DataFrames.
# - `columns=cols` (not `[cols]`): wrapping the Index in a list makes pandas
#   build a one-level MultiIndex, so every column label becomes a tuple.
# - `index=...`: keep the original row labels so X stays aligned with y_train /
#   y_test. The right-hand side is evaluated before the name is rebound, so
#   `X_train.index` below is still the index of the unscaled frame.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index)

### (B) Feature Selection

#### Recursive feature elimination (RFE) with random forest

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination down to 5 features, dropping one per step.
# The forest is seeded: without a fixed random_state the feature importances,
# and therefore the selected features, can change from run to run.
clf_rf = RandomForestClassifier(random_state=0)
rfe = RFE(estimator=clf_rf, n_features_to_select=5, step=1)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Show the labels of the columns that RFE kept.
rfe_kept = X_train.columns[rfe.support_]
print('Chosen best 5 feature by rfe:', rfe_kept)

In [None]:
# Use the columns RFE actually selected on this run instead of a hard-coded
# list, which silently goes stale whenever the selection changes.
# A boolean mask with .loc selects by position, so this works regardless of
# how the column labels are stored.
X_train_pick = X_train.loc[:, rfe.support_]
X_test_pick = X_test.loc[:, rfe.support_]
pick_col = X_train_pick.columns

In [None]:
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Random forest trained on the RFE-selected features and scored on the test split.
# Seeded so the reported accuracy is reproducible across runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_pick, y_train)
y_pred1 = rfc.predict(X_test_pick)

acc = accuracy_score(y_test, y_pred1)
print('Accuracy is: ', acc)

# confusion_matrix puts true classes on rows and predicted classes on columns;
# label the axes so the figure can be read on its own.
cm = confusion_matrix(y_test, y_pred1)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap='YlGnBu')
ax.set(xlabel='Predicted label', ylabel='True label');

#### Recursive feature elimination with cross-validation (RFECV) and random forest classification

In [None]:
from sklearn.feature_selection import RFECV

# RFE with 5-fold cross-validation: the number of features to keep is chosen
# by cross-validated accuracy rather than fixed in advance.
# The forest is seeded so the selected subset is reproducible across runs.
clf_rf2 = RandomForestClassifier(random_state=0)
rfecv = RFECV(estimator=clf_rf2, step=1, cv=5, scoring='accuracy')
rfecv = rfecv.fit(X_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_train.columns[rfecv.support_])

Let's plot the cross-validated accuracy against the number of selected features to see where it peaks.

In [None]:
# Mean cross-validated score for each candidate number of features.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# per-subset mean is now stored in `cv_results_['mean_test_score']`. Fall back
# to `grid_scores_` only on releases that predate `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
# Use the columns RFECV actually selected on this run instead of a hard-coded
# list, so the downstream models always match the selection printed earlier.
# A boolean mask with .loc selects by position, so this works regardless of
# how the column labels are stored.
X_train_selected = X_train.loc[:, rfecv.support_]
X_test_selected = X_test.loc[:, rfecv.support_]
select_col = X_train_selected.columns

In [None]:
# Random forest trained on the RFECV-selected features and scored on the test split.
# Seeded so the reported accuracy is reproducible across runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_selected, y_train)
y_pred2 = rfc.predict(X_test_selected)

acc = accuracy_score(y_test, y_pred2)
print('Accuracy is: ', acc)

# confusion_matrix puts true classes on rows and predicted classes on columns;
# label the axes so the figure can be read on its own.
cm = confusion_matrix(y_test, y_pred2)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap='YlGn')
ax.set(xlabel='Predicted label', ylabel='True label');

We will use the features selected by RFECV to build the model.

## 4. Building Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [None]:
def score_of_model(models,X_train,X_test,y_train,y_test):
    """Fit every model and tabulate its score on the test data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. Each estimator is fitted in place.
    X_train, y_train
        Data passed to ``fit``.
    X_test, y_test
        Data passed to ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per model name and a single ``'Score'`` column, sorted by
        score in ascending order.
    """
    # Reset NumPy's global random state before any model is fitted.
    np.random.seed(0)

    def _fit_then_score(estimator):
        # Train on the training data, then evaluate on the test data.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    results = {label: _fit_then_score(estimator) for label, estimator in models.items()}

    # One column per model with a single 'Score' row, flipped to one row per model.
    table = pd.DataFrame(results, index=['Score']).T
    return table.sort_values(by='Score')

In [None]:
# Candidate classifiers, keyed by their class name.
estimators = [
    LogisticRegression(max_iter=10000),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
]
models = {type(estimator).__name__: estimator for estimator in estimators}

In [None]:
# Fit and score every candidate on the selected features.
model_score = score_of_model(
    models,
    X_train=X_train_selected,
    X_test=X_test_selected,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Show the score table shaded with the 'coolwarm' colormap.
cm = sns.color_palette(palette='coolwarm', as_cmap=True)
score = model_score.style.background_gradient(cmap=cm)
score

As we can see, SVC gives the best results.

## 5. Model Evaluation

#### Run SVM with default hyperparameters 

In [None]:
# instantiate classifier with default hyperparameters
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Baseline: SVC with all default hyperparameters, fitted on the selected features.
svc = SVC().fit(X_train_selected, y_train)
y_pred = svc.predict(X_test_selected)
svc_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with default hyperparameters: {0:0.4f}'.format(svc_accuracy))

#### Run SVM with rbf kernel and C=100.0

In [None]:
# instantiate classifier with rbf kernel and C=100
# Same classifier with the regularization parameter C raised to 100.
svc100 = SVC(C=100.0).fit(X_train_selected, y_train)
y_pred = svc100.predict(X_test_selected)
svc100_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'.format(svc100_accuracy))

#### Run SVM with linear kernel 

In [None]:
# instantiate classifier with linear kernel and C=1.0
# SVC with a linear kernel and C=1.0.
linear_svc = SVC(C=1.0, kernel='linear').fit(X_train_selected, y_train)
y_pred = linear_svc.predict(X_test_selected)
linear_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with linear kernel and C=1.0 : {0:0.4f}'.format(linear_accuracy))

#### Run SVM with polynomial kernel

In [None]:
# instantiate classifier with polynomial kernel and C=1.0
# SVC with a polynomial kernel and C=1.0.
poly_svc = SVC(C=1.0, kernel='poly').fit(X_train_selected, y_train)
y_pred = poly_svc.predict(X_test_selected)
poly_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with polynomial kernel and C=1.0 : {0:0.4f}'.format(poly_accuracy))

#### Run SVM with sigmoid kernel 

In [None]:
# instantiate classifier with sigmoid kernel and C=1.0
# SVC with a sigmoid kernel and C=1.0.
sigmoid_svc = SVC(C=1.0, kernel='sigmoid').fit(X_train_selected, y_train)
y_pred = sigmoid_svc.predict(X_test_selected)
sigmoid_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with sigmoid kernel and C=1.0 : {0:0.4f}'.format(sigmoid_accuracy))

We get the maximum accuracy with the **rbf kernel** and C=1.0, where the accuracy is **0.9675**. Based on the above analysis, we can conclude that our classification model's accuracy is very good.

In [None]:
from sklearn.metrics import classification_report

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so importing it unconditionally raises ImportError on current releases.
# Where it is missing, define a thin stand-in with the same name that forwards
# to its replacement, ConfusionMatrixDisplay.from_estimator.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of a fitted estimator on (X, y_true).

        Compatibility stand-in for the function removed from scikit-learn;
        extra keyword arguments (e.g. ``cmap``) are passed through unchanged.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# Final model: default SVC on the selected features, with a per-class report
# of precision, recall and F1 on the test split.
model = SVC().fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix of the final model on the test split.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `ConfusionMatrixDisplay.from_estimator` is its replacement and takes the same
# estimator / X / y arguments. Imported here so this cell is self-sufficient.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model, X_test_selected, y_test, cmap='OrRd')

**Conclusion:**

The chosen model was SVC, since it is the most accurate of the models compared and reached a very high accuracy (0.9675) on the test set.

In this project, we use RFECV to select the features.