# Objective: Predict the price of phone based on its features

In [14]:
# libraries imports
# from google.colab import files  # uncomment this line if working with google colab
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split  # for splitting the data
from sklearn.preprocessing import StandardScaler  # preprosessor that eliminates data bias
from sklearn.neighbors import KNeighborsClassifier  # The main classification algorithm
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score  # 

%matplotlib notebook


# Data upload and information gathering

In [15]:
# Uploading dataset as below if working with google colab
# data = files.upload()

In [16]:
dataset = pd.read_csv("phones.csv")

In [17]:
len(dataset)

2000

In [24]:
dataset.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [25]:
dataset.tail()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0
1999,510,1,2.0,1,5,1,45,0.9,168,6,...,483,754,3919,19,4,2,1,1,1,3


In [57]:
df = pd.DataFrame(dataset)

In [58]:
row_columns = df.shape

In [59]:
row_columns

(2000, 21)

In [60]:
# Detailed information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   float64
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   float64
 4   fc             2000 non-null   float64
 5   four_g         2000 non-null   float64
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   float64
 11  px_height      2000 non-null   float64
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   float64
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   float64
 18  touch_sc

# Data Visualization

In [29]:
# Data correlation between columns
dataset[["battery_power", "blue", "clock_speed", "dual_sim", "fc", "four_g", "int_memory", 
        "m_dep", "mobile_wt", "n_cores", "pc", "px_height", "px_width" ,"ram" ,"sc_h" ,"sc_w", "talk_time" ,
        "three_g" ,"touch_screen" ,"wifi"]].corr()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
battery_power,1.0,0.011252,0.011482,-0.041847,0.033334,0.015665,-0.004004,0.034085,0.001844,-0.029727,0.031441,0.014901,-0.008402,-0.000653,-0.029959,-0.021421,0.05251,0.011522,-0.010516,-0.008343
blue,0.011252,1.0,0.021419,0.035198,0.003593,0.013443,0.041177,0.004049,-0.008605,0.036161,-0.009952,-0.006872,-0.041533,0.026351,-0.002952,0.000613,0.013934,-0.030236,0.010061,-0.021863
clock_speed,0.011482,0.021419,1.0,-0.001315,-0.000434,-0.043073,0.006545,-0.014364,0.01235,-0.005724,-0.005245,-0.014523,-0.009476,0.003443,-0.029078,-0.007378,-0.011432,-0.046433,0.019756,-0.024471
dual_sim,-0.041847,0.035198,-0.001315,1.0,-0.029123,0.003187,-0.015679,-0.022142,-0.008979,-0.024658,-0.017143,-0.020875,0.014291,0.041072,-0.011949,-0.016666,-0.039404,-0.014008,-0.017117,0.02274
fc,0.033334,0.003593,-0.000434,-0.029123,1.0,-0.01656,-0.029133,-0.001791,0.023618,-0.013356,0.644595,-0.00999,-0.005176,0.015099,-0.011014,-0.012373,-0.006829,0.001793,-0.014828,0.020085
four_g,0.015665,0.013443,-0.043073,0.003187,-0.01656,1.0,0.00869,-0.001823,-0.016537,-0.029706,-0.005598,-0.019236,0.007448,0.007313,0.027166,0.037005,-0.046628,0.584246,0.016758,-0.01762
int_memory,-0.004004,0.041177,0.006545,-0.015679,-0.029133,0.00869,1.0,0.006886,-0.034214,-0.02831,-0.033273,0.010441,-0.008335,0.032813,0.037771,0.011731,-0.00279,-0.009366,-0.026999,0.006993
m_dep,0.034085,0.004049,-0.014364,-0.022142,-0.001791,-0.001823,0.006886,1.0,0.021756,-0.003504,0.026282,0.025263,0.023566,-0.009434,-0.025348,-0.018388,0.017003,-0.012065,-0.002638,-0.028353
mobile_wt,0.001844,-0.008605,0.01235,-0.008979,0.023618,-0.016537,-0.034214,0.021756,1.0,-0.018989,0.018844,0.000939,9e-05,-0.002581,-0.033855,-0.020761,0.006209,0.001551,-0.014368,-0.000409
n_cores,-0.029727,0.036161,-0.005724,-0.024658,-0.013356,-0.029706,-0.02831,-0.003504,-0.018989,1.0,-0.001193,-0.006872,0.02448,0.004868,-0.000315,0.025826,0.013148,-0.014733,0.023774,-0.009964


In [52]:
np.random.seed(0)
plt.style.use('ggplot')

In [53]:
X, y = dataset.data, dataset.target

AttributeError: 'DataFrame' object has no attribute 'data'

# Data  cleaning

In [30]:
eliminate_null_zero = ["battery_power", "blue", "clock_speed", "dual_sim", "fc", "four_g", "int_memory", 
        "m_dep", "mobile_wt", "n_cores", "pc", "px_height", "px_width" ,"ram" ,"sc_h" ,"sc_w", "talk_time" ,
        "three_g" ,"touch_screen" ,"wifi"]

# Check columns before elimination of zero values
print(dataset["blue"])

0       0
1       1
2       1
3       1
4       1
       ..
1995    1
1996    1
1997    0
1998    0
1999    1
Name: blue, Length: 2000, dtype: int64


In [32]:
for column in eliminate_null_zero:
    dataset[column] = dataset[column].replace(0, np.NaN)
    mean = int(dataset[column].mean(skipna=True))
    dataset[column] = dataset[column].replace(np.NaN, mean)

# Check columns after elimination of zero values
print(dataset["blue"])

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
1995    1.0
1996    1.0
1997    1.0
1998    1.0
1999    1.0
Name: blue, Length: 2000, dtype: float64


# Needed Rows for Model Training

In [None]:
needed_rows = ["battery_power", "blue", "clock_speed", "dual_sim", "fc", "four_g", "int_memory", 
        "m_dep", "mobile_wt", "n_cores", "pc", "px_height", "px_width" ,"ram" ,"sc_h" ,"sc_w", "talk_time" ,
        "three_g" ,"touch_screen"]

# Model Training

In [33]:
# Split data into training and testing
X = dataset.iloc[:, 0:8]  # Decide the number of columns upto but not including the column
y = dataset.iloc[:, 8]  # Focus is on the target column alone
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [35]:
# FEATURE SCALING
# Nb: For any algo that computes distance or assumes normality, scale your features
# We usually don't train the y_train & y_test, only the data going in

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


In [36]:
# Define the Model: K-NN
classifier = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean') # we are using eucladian to measure the distance
# n_neigbors is the squareroot of len(y)

In [37]:
# Model Fitting
classifier.fit(X_train, y_train)

In [39]:
# Predict the test set results
y_pred = classifier.predict(X_test)

In [40]:
y_pred

array([ 84,  81,  85,  82,  95,  86,  93,  83, 103,  80, 125,  82, 182,
       129,  81, 112,  84,  94, 108, 153,  95, 197, 124, 108, 120,  90,
       100,  83, 133, 149,  85,  84,  85, 105,  80, 100,  81, 185,  80,
       181,  86, 110,  95,  80,  83, 186,  81,  86,  94,  88,  98,  80,
       105,  83,  83,  80, 114,  88,  85, 141, 152, 131,  80,  84, 158,
        81, 198, 102, 106,  81, 120,  80,  80,  91, 100,  86,  81, 130,
       162, 115, 105, 138, 101,  85, 119,  80, 125,  95,  80, 169, 200,
        86, 111,  90,  86,  91, 104, 120, 104, 115, 129,  85,  88, 136,
        85, 150,  88,  94,  82, 181,  80,  84,  84, 185,  88, 140,  82,
       108, 157,  81,  81,  88,  99,  84,  80, 147,  90,  83, 181, 154,
        84,  80,  87, 176, 154,  80, 100,  85,  93, 124,  84,  84, 127,
        91, 133,  81, 164,  80,  83,  82,  84,  82,  95, 160,  92,  83,
        89,  82,  85, 105, 158,  84, 101,  82,  81, 115, 158,  90, 105,
       163,  94,  84,  86,  87,  84,  95,  84,  81,  95, 101,  8

In [41]:
# Model Evaluation Using Confusion matrix
confusion_mtrx = confusion_matrix(y_test, y_pred)

In [49]:
confusion_mtrx

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 2, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [50]:
print(f1_score(y_test, y_pred))

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [51]:
# Determining the accuracy
print(accuracy_score(y_test, y_pred))

0.01
