In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostRegressor
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw reviews dataset from a CSV file in the current working directory.
data = pd.read_csv('./Amazon_Unlocked_Mobile.csv')

In [3]:
# Standard deviation of the Rating column over the full dataset
# (pandas computes the sample estimate, ddof=1, by default).
data['Rating'].std()

1.5482158148665002

In [4]:
# Hold out 25% of the rows as the test split. A fixed random_state pins the
# shuffle, so re-running the notebook reproduces the same train/test split
# and therefore the same exported CSV files further down.
data_train, data_test = train_test_split(data, test_size=0.25, random_state=42)

In [5]:
# Display the training split (large frames are rendered as a truncated preview).
data_train

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
33234,Apple iPhone 5c 32GB (Yellow) - AT&T,Apple,224.77,3,had to return was not unlocked,0.0
263376,Nokia C3-00 Unlocked Cell Phone (Slate) with Q...,Nokia,49.99,5,"my contract gonna end next summer, so I need a...",5.0
277119,Nokia N73 Unlocked Genuine Smart Music Mobile ...,Nokia,151.44,1,It never booited up taught it was the battery ...,0.0
411077,X-shop 5.5-inch Unlocked White Lenovo A850 3g ...,,119.99,4,"There are a problem, the menu on the PC is in ...",
130845,BLU PURE XL Smartphone - 4G LTE GSM Unlocked -...,BLU,134.95,4,Worth every penny. Great picture quality. Snap...,1.0
...,...,...,...,...,...,...
333887,Samsung Galaxy S Duos II GT-S7582 Factory Unlo...,,280.00,5,"Nice cellphone, came with everything.",0.0
110053,BLU Dash JR 4.0 K Smartphone - Unlocked - Blue,BLU,99.99,4,muy bueno,0.0
203671,Huawei U8833/Y300 Android 4.1 Dual Core 1.0GHz...,,99.00,5,cool,0.0
350160,Samsung Galaxy S5 Mini G800F 16GB 4G LTE Unloc...,,299.00,5,Great phone,0.0


# Basic data reformatting

In [6]:
# Columns to keep, mapped to their shorter names. Columns that are not listed
# here (e.g. 'Product Name') are dropped by the selection below.
renamer = {
    'Brand Name': 'Brand',
    'Price': 'Price',
    'Rating': 'Rating',
    'Reviews': 'Review',
    'Review Votes': 'Upvotes',
}
# Run the same select-then-rename step over both splits.
data_train, data_test = (
    frame[list(renamer)].rename(columns=renamer)
    for frame in (data_train, data_test)
)

In [7]:
# Discard the row labels inherited from the original dataset and renumber
# each split from 0.
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

In [8]:
def price_rounder(price):
    """Round ``price`` to the nearest multiple of 5.

    A remainder of exactly 2.5 rounds up. NaN comes back as NaN: the
    comparison evaluates to False and NaN propagates through the arithmetic.
    """
    step = 5
    lower_multiple = (price // step) * step
    past_midpoint = price % step >= step / 2
    # The boolean acts as 0 or 1, adding one extra step only when rounding up.
    return lower_multiple + step * past_midpoint
# Snap every price in both splits to the nearest multiple of 5, element by element.
data_train['Price'] = data_train['Price'].map(price_rounder)
data_test['Price'] = data_test['Price'].map(price_rounder)

In [9]:
# Display the training split again to check the renamed columns and rounded prices.
data_train

Unnamed: 0,Brand,Price,Rating,Review,Upvotes
0,Apple,225.0,3,had to return was not unlocked,0.0
1,Nokia,50.0,5,"my contract gonna end next summer, so I need a...",5.0
2,Nokia,150.0,1,It never booited up taught it was the battery ...,0.0
3,,120.0,4,"There are a problem, the menu on the PC is in ...",
4,BLU,135.0,4,Worth every penny. Great picture quality. Snap...,1.0
...,...,...,...,...,...
310375,,280.0,5,"Nice cellphone, came with everything.",0.0
310376,BLU,100.0,4,muy bueno,0.0
310377,,100.0,5,cool,0.0
310378,,300.0,5,Great phone,0.0


# EDA

In [10]:
# Count missing values in each column of the training split.
data_train.isna().sum()

Brand      48988
Price       4481
Rating         0
Review        54
Upvotes     9251
dtype: int64

In [11]:
# Count missing values in each column of the test split.
data_test.isna().sum()

Brand      16183
Price       1452
Rating         0
Review         8
Upvotes     3045
dtype: int64

In [12]:
# Drop rows that are missing the rating, the review text or the vote count
# from both splits. Missing Brand and Price values are left in place.
critical_columns = ['Rating', 'Review', 'Upvotes']
data_train = data_train.dropna(subset=critical_columns)
data_test = data_test.dropna(subset=critical_columns)

In [13]:
# Recount missing values per column in the training split; Rating, Review
# and Upvotes should now all be zero.
data_train.isna().sum()

Brand      47469
Price       4338
Rating         0
Review         0
Upvotes        0
dtype: int64

In [14]:
# Recount missing values per column in the test split; Rating, Review
# and Upvotes should now all be zero.
data_test.isna().sum()

Brand      15696
Price       1419
Rating         0
Review         0
Upvotes        0
dtype: int64

In [15]:
# Write the output files; none of them include the row index.

# Training split with the ratings included.
data_train.to_csv('./train.csv', index=False)

# Test split with the Rating column removed.
data_test.drop(columns=['Rating']).to_csv('./test.csv', index=False)

# The removed test ratings, one value per line with no header row.
data_test.loc[:, 'Rating'].to_csv('./answers.csv', index=False, header=False)

# An all-zero column with the same number of lines as answers.csv.
data_test.loc[:, 'Rating'].mul(0).to_csv('./sample_submission.csv', index=False, header=False)

In [16]:
# Read the exported answers back and report the sample variance of the test
# ratings. .var() uses the same default ddof=1 as .std() but skips the
# square-root-then-square round trip.
pd.read_csv('./answers.csv', header=None).var()

0    2.387493
dtype: float64