## Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import pickle


: 

## Import CSV file

In [None]:
# Load the scraped data set.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook elsewhere.
csv_path = r'F:\ADS\Web Scrapping\data_.csv'
df = pd.read_csv(csv_path)

## Data Cleaning

In [None]:
#1. Rename the columns: drop the 'Sum of ' prefix and the '.1' suffix
column_names = {
    'Product_name.1': 'Product_name',
    'Sum of Product_Price': 'Product_Price',
    'Sum of RAM in (GB)': 'RAM in (GB)',
    'Sum of ROM in (GB)': 'ROM in (GB)',
    'Sum of Battery': 'Battery',
    'Sum of Reviews': 'Reviews',
}
df.rename(columns=column_names, inplace=True)

In [None]:
#2. Count the missing values in each column
missing_mask = df.isnull()
missing_mask.sum()

In [None]:
#3. The null check reported 4 missing values in the Reviews column; fill them
#   with the most frequent rating. The {column: value} dict restricts the fill
#   to Reviews -- a bare scalar would overwrite NaNs in *every* column with a
#   review rating.

df.fillna({'Reviews': df['Reviews'].mode()[0]}, inplace=True)

# Note- The mean is an alternative fill value (mean() returns a scalar, so it
# must not be indexed with [0]):
# df.fillna({'Reviews': df['Reviews'].mean()}, inplace=True)

In [None]:
#4. Add a Brand column: the first whitespace-separated word of Product_name
# Method 1.
name_parts = df['Product_name'].str.split(expand=True)
df['Brand'] = name_parts[0]

# Method 2.
'''
--Try by yourself--

def rep(col):
                            
    a=str(col).split()
    return a[0]

df.Product_name.apply(rep)

'''

In [None]:
#5. At index 7 the product "I Kall K570" got the Brand "I" (first word only);
#   replace it with the full brand name "I Kall".
# Assign the result back instead of calling replace(..., inplace=True) on
# df.Brand: an in-place call on a column accessor acts on a temporary object
# and does not update df under pandas Copy-on-Write.
df['Brand'] = df['Brand'].replace('I', 'I Kall')

In [None]:
# Replace the currency prefix "₹ " with a single space so the price can later
# be parsed as a number for model building.

def rem(col):
    """Return the string *col* with every "₹ " occurrence replaced by ' '."""
    pieces = col.split('₹ ')
    return ' '.join(pieces)

# Strip the currency prefix from every price
df['Product_Price'] = df['Product_Price'].apply(rem)

In [None]:
#6. Prices contain "," as a thousands separator; drop it and convert to int
def coma(col):
    """Return *col* as an int after removing every comma from its text form."""
    digits = ''.join(str(col).split(','))
    return int(digits)

# Convert every price to an integer
df['Product_Price'] = df['Product_Price'].apply(coma)

## Exploratory Data Analysis (EDA)

In [None]:
# List the distinct brands present in the data
df['Brand'].unique()

In [None]:
# Let's see an overview: summary statistics of the DataFrame
df.describe()

# Here we find that the min and max values of Battery and Reviews are
# implausible; they are filtered out in the next cells.

In [None]:
# Keep only the products whose Reviews value lies between 0 and 5 (inclusive)
rating = df['Reviews']
df = df[(rating >= 0) & (rating <= 5.0)]

In [None]:
# Keep only the products whose Battery value lies between 2500 and 8000 (inclusive)
capacity = df['Battery']
df = df[(capacity >= 2500) & (capacity <= 8000)]


In [None]:
# Number of products per Brand
df.value_counts(subset='Brand')

In [None]:
# How many products share each price, most common price first
price_counts = df['Product_Price'].value_counts()
price_counts.sort_values(ascending=False)

In [None]:
# How many products share each colour
df['Product_colour'].value_counts()

In [None]:
# Show the products priced above 10,000 (the row count of the output answers "how many")
is_above_10k = df['Product_Price'] > 10000
df[is_above_10k]




In [None]:
# Show the products whose Battery value exceeds 3000 (the row count answers "how many")

has_big_battery = df['Battery'] > 3000
df[has_big_battery]

In [None]:
# Create a new column that buckets each product by its Reviews value:
#   Reviews <= 2 -> "Bad", 2 < Reviews <= 3 -> "Avg", 3 < Reviews <= 5 -> "Good"
condtions=[
    (df['Reviews']<=2),
    (df['Reviews']>2) & (df['Reviews']<=3),
    (df['Reviews']>3) & (df['Reviews']<=5)
]
qun=['Bad','Avg','Good']
# An explicit string default is required: np.select's implicit default is the
# integer 0, which has no common dtype with the string labels (recent NumPy
# raises TypeError). Rows matching no condition are labelled 'Unknown'.
# assign() returns a new frame instead of writing a column into the filtered
# slice currently bound to df, which avoids SettingWithCopyWarning.
df=df.assign(**{'Quality by Reviews':np.select(condtions,qun,default='Unknown')})

In [None]:
# Number of products in each "Quality by Reviews" bucket
df.value_counts(subset='Quality by Reviews')

In [None]:
# Display the cleaned DataFrame, including the 'Quality by Reviews' column
df

## Visualization & Relationship

In [None]:
# Price against Battery (scatter) and against Reviews (scatter + bar).
# Each entry: (slot in the 2x2 grid, pyplot drawing function, x column, title).
# Slot 2 of the grid is intentionally left empty, as before.
panels = [
    (1, plt.scatter, 'Battery', 'With scatter plot'),
    (3, plt.scatter, 'Reviews', 'With scatter plot'),
    (4, plt.bar, 'Reviews', 'With Bar plot'),
]

for slot, draw, feature, heading in panels:
    plt.subplot(2, 2, slot)
    draw(df[feature], df['Product_Price'])
    plt.xlabel(feature)
    plt.ylabel('Price')
    plt.title(heading)

# Widen the figure area and add spacing between the panels
layout = dict(left=0.1, bottom=0.1, right=2, top=1, wspace=0.4, hspace=0.4)
plt.subplots_adjust(**layout)
plt.show()

In [None]:
# Price against RAM in (GB) and ROM in (GB), each as a scatter and a bar plot.
# Each entry: (slot in the 2x2 grid, pyplot drawing function, x column, title).
panels = [
    (1, plt.scatter, 'RAM in (GB)', 'With scatter plot'),
    (2, plt.bar, 'RAM in (GB)', 'With Bar plot'),
    (3, plt.scatter, 'ROM in (GB)', 'With scatter plot'),
    (4, plt.bar, 'ROM in (GB)', 'With Bar plot'),
]

for slot, draw, feature, heading in panels:
    plt.subplot(2, 2, slot)
    draw(df[feature], df['Product_Price'])
    plt.xlabel(feature)
    plt.ylabel('Price')
    plt.title(heading)

# Widen the figure area and add spacing between the panels
layout = dict(left=0.1, bottom=0.1, right=2, top=1, wspace=0.4, hspace=0.4)
plt.subplots_adjust(**layout)
plt.show()

## Prepare models for price prediction

In [None]:
# Hold out 20% of the rows for testing; random_state=0 keeps the split reproducible
split_frames = train_test_split(df, test_size=0.20, random_state=0)
train, test = split_frames

In [None]:
# Display the training portion of the split
train

In [None]:
# Training features: everything except the product name, the target price and
# the derived review-quality label
x_train = train.drop(columns=['Product_name', 'Product_Price', 'Quality by Reviews'])

In [None]:
# Training target: the product price
y_train = train['Product_Price']

In [None]:
# Test features: same column selection as the training features
x_test = test.drop(columns=['Product_name', 'Product_Price', 'Quality by Reviews'])

In [None]:
# Test target: the product price
y_test = test['Product_Price']

In [None]:
# Create an object of LabelEncoder
le=LabelEncoder()

In [None]:
# Convert the categorical values into numeric...
def convert(df,col):
    for i in col:
        df[i]=le.fit_transform(df[i])
        
    return df

In [None]:
convert(x_train,['Product_colour','Brand'])

In [None]:
convert(x_test,['Product_colour','Brand'])

In [None]:
# Create a LinearRegression model with its default settings
lr=LinearRegression()

In [None]:
# Fit the regression on the encoded training features and the training prices
lr.fit(X=x_train, y=y_train)

In [None]:
# Predict prices for the held-out test features
pred = lr.predict(X=x_test)

In [None]:
# Display the predicted prices for the test set
pred

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
# Mean squared error between the true and the predicted test prices
mean_squared_error(y_true=y_test, y_pred=pred)

In [None]:
# Mean absolute error between the true and the predicted test prices
mean_absolute_error(y_true=y_test, y_pred=pred)

## Model Deployment

In [None]:
# Serialize the fitted model to disk; the with-block guarantees the file is
# flushed and closed (a bare open() left the handle open).
with open('Mobile_prize_prediction.pkl','wb') as model_file:
    pickle.dump(lr,model_file)

In [None]:
# Load the model back from disk; the with-block closes the file handle.
# NOTE: pickle.load executes arbitrary code -- only load files you created or trust.
with open('Mobile_prize_prediction.pkl','rb') as model_file:
    pickeled_model=pickle.load(model_file)

In [None]:
# Predict the price of one hand-written sample row.
# NOTE(review): the six values presumably follow the column order of x_train,
# with the categorical entries given as label-encoded integers -- confirm.
sample_features = [[11, 3, 32, 5000, 4.3, 4]]
pickeled_model.predict(sample_features)