## Estimating Continous Values Using Linear Regression


> ### Artificial Intelligence - Final Computer Assginment
> ### Shakiba Bolbolian Khah - 810196426

* ### Libraries

In [1]:
from __future__ import unicode_literals
from hazm import *
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
pd.options.mode.chained_assignment = None
import math
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import datetime
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from unidecode import unidecode
import re
from scipy import stats
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.ensemble import RandomForestClassifier

* ### Preprocessing

> * **Deleting Outliners**

In [2]:
def deleteOutliners():
    global df
    df = df[((df['price'] == -1) | ((df['price'] > 40000) & (df['brand'] != 'Nokia::نوکیا')) |
            ((df['price'] > 30000) & (df['brand'] == 'Nokia::نوکیا'))) & (df['price'] < 5000000)]
#     z = np.abs(stats.zscore(df['price']))
#     threshold = 3
#     df = df[(z < 3)]

> * **Handling Categorical Features (Label and One Hot Encoding)**

In [3]:
def encodeData(type):
    global df
    if type == 'Label':
        labelencoder = LabelEncoder()
        
        df['brand cat'] = labelencoder.fit_transform(df['brand'])
        df['city cat'] = labelencoder.fit_transform(df['city'])

        global brandNames, cityNames
        for i in range(len(df['brand'])): 
            brandNames[df.iloc[i]['brand cat']] = df.iloc[i]['brand']
            cityNames[df.iloc[i]['city cat']] = df.iloc[i]['city']

        df = df.drop(columns= ['city','brand'])
        
    elif type == 'one hot':
        processedDf = pd.concat([df, pd.get_dummies(df['brand cat'])], axis=1)
        processedDf = processedDf.rename(columns= brandNames)
        processedDf = pd.concat([processedDf, pd.get_dummies(processedDf['city cat'])], axis=1)
        processedDf = processedDf.rename(columns= cityNames)
        processedDf = processedDf.drop(columns=['city cat', 'brand cat'])
        return processedDf

> * **Improving Persian Texts**

In [4]:
stopwords = list(string.punctuation) + ['گوشی','سلام','قیمت','تومان','تومن','موبایل','تلفن','همراه'
                                        ,'فوری','تخفیف','عجله','ممنون']+ stopwords_list()

def improvePersian(l):
    newList = []
    p = r'^[a-zA-Z0-9۱-۹]+'
    for w in l:
        if re.match(p, w) and not w.isnumeric():
                newW = unidecode(str(w))
                newList.append(newW.lower())
#         elif w not in stopwords:
#             newList.append(w)
            
    return newList

> * **Text Processing (Tokenizing, Stemming, Normalizing, etc)**

In [5]:
def extractBrand(b):
    return b.split(':')[0]
       
def stemData(l):
    ps = PorterStemmer()
#     lemmatizer = WordNetLemmatizer()
    t = []
    for i in l:
        t.append(ps.stem(i))
    return t
    
def preprocessTexts():
    global df
    normalizer = Normalizer()
    df['desc'] = df['desc'].apply(normalizer.affix_spacing)

    tokenizer = WordTokenizer()
    df['desc'] = df['desc'].apply(tokenizer.tokenize)
    df['title'] = df['title'].apply(tokenizer.tokenize)
    
    df['title'] = df['title'].apply(improvePersian)
    df['desc'] = df['desc'].apply(improvePersian)

    df['brand'] = df['brand'].apply(extractBrand)
    
    df['desc'] = df['desc'].apply(stemData)
    df['title'] = df['title'].apply(stemData)


> * **Date Processing**

In [6]:
times = {}

def preprocessDate():
    def isWeekend(s):
        weekDay = s.split()[0]
        if weekDay=='Saturday' or weekDay == 'Friday':
            return 1
        else:
            return 0
    df['is weekend'] = df['created_at'].apply(isWeekend)

> * **Applying Preprocessors (part I)**

In [7]:
df = pd.read_csv('./mobile_phone_dataset.csv', usecols = ['brand','city','title',
                                                          'desc','image_count','created_at','price'])

brandNames = {}
cityNames = {}

deleteOutliners()
preprocessTexts()
preprocessDate()
df = df.drop(columns= ['created_at'])
encodeData('Label')

> * **Calculating Correlation Between Different Price Column and Other Ones**

In [8]:
corr, _ = pearsonr(df['price'], df['brand cat'])
print(brandNames)
print("Correlation between brand and price:", corr)

{0: 'Apple', 1: 'HTC', 2: 'Huawei', 3: 'LG', 4: 'Lenovo', 5: 'Nokia', 6: 'Samsung', 7: 'Sony', 8: 'ZTE'}
Correlation between brand and price: -0.2941200888869938


In [9]:
corr, _ = pearsonr(df['price'], df['city cat'])
print(cityNames)
print("Correlation between city and price:", corr)

{0: 'Ahvaz', 1: 'Isfahan', 2: 'Karaj', 3: 'Kermanshah', 4: 'Mashhad', 5: 'Qom', 6: 'Shiraz', 7: 'Tabriz', 8: 'Tehran'}
Correlation between city and price: 0.09134597090831476


In [10]:
corr, _ = pearsonr(df['price'], df['is weekend'])
print("Correlation between weekend and price:", corr)

Correlation between weekend and price: -0.014327284995315989


In [11]:
corr, _ = pearsonr(df['price'], df['image_count'])
print("Correlation between image count and price:", corr)

Correlation between image count and price: 0.02673846216780569


> * **Applying Preprocessors (part II)**

In [12]:
regressionDF = encodeData('one hot')
regressionDF['text part'] = regressionDF['title'] + regressionDF['desc']
regressionDF = regressionDF.drop(columns = ['is weekend', 'image_count'])
regressionDF.head()

Unnamed: 0,title,desc,price,Apple,HTC,Huawei,LG,Lenovo,Nokia,Samsung,...,Ahvaz,Isfahan,Karaj,Kermanshah,Mashhad,Qom,Shiraz,Tabriz,Tehran,text part
0,[],[],60000,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,[]
1,[],[],1150000,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,[]
2,[j5],[],590000,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,[j5]
3,[5s],[],1100000,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,[5s]
4,"[galaxi, s5, gold]",[],900000,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,"[galaxi, s5, gold]"


> * **Splitting Data**

In [13]:
testData = regressionDF[regressionDF['price'] == -1]
trainData = regressionDF[regressionDF['price'] != -1]
randomState = 34
y = trainData['price']
X = trainData.drop(columns=['price', 'title', 'desc'])
yPredict = testData['price']
XPredict = testData.drop(columns=['price', 'title', 'desc'])

XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.20, random_state= randomState)

> * **Constructing Bag of Words**

In [14]:
wordsBag = {}

for i in XTrain['text part']:
    for j in i:
        if j in wordsBag:
            wordsBag[j] += 1
        else:
            wordsBag[j] = 1

In [15]:
remList = []
for i in wordsBag:
    num = wordsBag[i]
    if num < 150  or num > 2500 or i.isnumeric():
        remList.append(i)

d = [wordsBag.pop(key) for key in remList]   

> * **Cheking Existance of Bag of Words Data in Each Row**

In [16]:
def handleTexts(data):
    wordAttr = {k: [] for k in wordsBag}

    for t in data['text part']:
        for w in wordsBag:
            if w in t:
                wordAttr[w].append(1)
            else:
                wordAttr[w].append(0)
    return pd.DataFrame.from_dict(wordAttr)

In [17]:
def mergeDF(df1, df2):
    df1.is_copy = False
    df2.is_copy = False
    df1.loc[:,'idx'] = list(range(len(df1)))
    df2.loc[:,'idx'] = list(range(len(df2)))
    mergedDF = pd.merge(left= df1, right= df2, left_on='idx', right_on='idx')
    return mergedDF.drop(columns = ['idx', 'text part'])

* ### Processing

> * **Evaluation Function (MSE & MAE)**

In [18]:
yTestList = yTest.to_list()
yTrainList = yTrain.to_list()
yPredictIndex = list(yPredict.index)

def evaluateModel(predict, real, dataType):
    MSE = 0
    MAE = 0
    for i in range(len(predict)):
        MSE += math.pow(predict[i] - real[i] , 2)
        MAE += abs(predict[i] - real[i])

    MSE /= len(predict)
    MAE /= len(predict)
    print('MSE value for '+ dataType + ' data :' + '%.2f'%MSE)
    print('MAE value for '+ dataType + ' data :' + '%.2f'%MAE)
    print('-------------------------')

> * **Writing Predictions in File**

In [19]:
def writeInFile(predictDF):
    output = pd.DataFrame(columns = ['index', 'price'])
    
    for i in range(len(predictDF)):
        output = output.append({'index': yPredictIndex[i] , 'price': predictDF[i]}, ignore_index=True)
    output.to_csv(r'./output.csv', index = False, header=True)


> * **Model Construction, Training & Testing**

In [20]:
def trainTestPredictModel():
    trainTextDF = handleTexts(XTrain)
    XTrainFinal = mergeDF(XTrain, trainTextDF)
    linearReg = LinearRegression()
    reg = linearReg.fit(XTrainFinal, yTrain)
    print('Score of model: %.4f' %reg.score(XTrainFinal, yTrain))
    print('-------------------------')
    
    testTextDf = handleTexts(XTest)
    XTestFinal = mergeDF(XTest, testTextDf)
    yPredTest = reg.predict(XTestFinal)
    evaluateModel(yPredTest, yTestList, 'test')
    yPredTrain = reg.predict(XTrainFinal)
    evaluateModel(yPredTrain, yTrainList, 'train')
    
    predictTextDf = handleTexts(XPredict)
    XPredictFinal = mergeDF(XPredict, predictTextDf)
    yPredictOutput = reg.predict(XPredictFinal)
    writeInFile(yPredictOutput)
        
    
trainTestPredictModel()

Score of model: 0.5422
-------------------------
MSE value for test data :139992097758.22
MAE value for test data :269497.34
-------------------------
MSE value for train data :138151570208.64
MAE value for train data :268614.84
-------------------------
