## Imports

In [50]:
import numpy as np 
import pandas as pd
from scipy.sparse import hstack
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Dataset

In [51]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
print("Dimension of Training Dataframe: ", train_df.shape)
print("Dimension of Testing Dataframe: ", test_df.shape)

Dimension of Training Dataframe:  (1260154, 8)
Dimension of Testing Dataframe:  (222381, 7)


## Understanding the Data

In [52]:
train_df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_NAME,PRODUCT_CONDITION,CATEGORY,PRODUCT_BRAND,SHIPPING_AVAILABILITY,PRODUCT_DESCRIPTION,PRODUCT_PRICE
0,952289,Lipstick queen reserved maiwu,4,Beauty/Makeup/Lips,,0,Lipstick Queen Jean Queen and medieval lipstic...,20.0
1,121903,Garbage Pail Kids blu ray,3,Electronics/Media/Blu-Ray,,1,26Plays great. Tested. Watched once. No scratc...,15.0
2,280534,green floam/slime,1,Kids/Toys/Arts & Crafts,,0,"-6 oz, dragon fruit scented - KEEP OUT OF REAC...",8.0
3,787961,Wallet beige monogram,1,Women/Women's Accessories/Wallets,,0,Wallet brand new never used,31.0
4,479292,Triangle Bike Or Body Blue Light,3,Sports & Outdoors/Outdoors/Other,,0,"Triangle Bike Or Body Blue Light, steady or fl...",8.0


In [53]:
train_df.describe()

Unnamed: 0,PRODUCT_ID,PRODUCT_CONDITION,SHIPPING_AVAILABILITY,PRODUCT_PRICE
count,1260154.0,1260154.0,1260154.0,1260154.0
mean,740887.6,1.907482,0.4472382,26.72048
std,428013.6,0.903248,0.4972086,38.55894
min,0.0,1.0,0.0,0.0
25%,369931.2,1.0,0.0,10.0
50%,740868.5,2.0,0.0,17.0
75%,1111356.0,3.0,1.0,29.0
max,1482534.0,5.0,1.0,2009.0


## EDA

### Handling Missing Values

In [54]:
train_df.isna().sum()

PRODUCT_ID                    0
PRODUCT_NAME                  0
PRODUCT_CONDITION             0
CATEGORY                   5416
PRODUCT_BRAND            537885
SHIPPING_AVAILABILITY         0
PRODUCT_DESCRIPTION           3
PRODUCT_PRICE                 0
dtype: int64

In [55]:
test_df.isna().sum()

PRODUCT_ID                   0
PRODUCT_NAME                 0
PRODUCT_CONDITION            0
CATEGORY                   911
PRODUCT_BRAND            94797
SHIPPING_AVAILABILITY        0
PRODUCT_DESCRIPTION          1
dtype: int64

In [56]:
train_df['PRODUCT_BRAND'] = train_df['PRODUCT_BRAND'].fillna('local')
test_df['PRODUCT_BRAND'] = test_df['PRODUCT_BRAND'].fillna('local')

train_df['CATEGORY'] = train_df['CATEGORY'].fillna('general/general/general')
test_df['CATEGORY'] = test_df['CATEGORY'].fillna('general/general/general')

train_df['PRODUCT_DESCRIPTION'] = train_df['PRODUCT_DESCRIPTION'].fillna('no_description')
test_df['PRODUCT_DESCRIPTION'] = test_df['PRODUCT_DESCRIPTION'].fillna('no_description')

### Handling ID Column

In [57]:
train_df_ids = train_df['PRODUCT_ID']
train_df.drop(['PRODUCT_ID'], axis = 1, inplace = True)

test_df_ids = test_df['PRODUCT_ID']
test_df.drop(['PRODUCT_ID'], axis = 1, inplace = True)

### Handling Category Column

In [58]:
def split_category(text):
  try: return text.split("/")
  except: return ('general','general','general')

In [59]:
# Training Dataset
train_df['Main_Category'], train_df['Sub_Category_1'], train_df['Sub_Category_2'] = \
zip(*train_df['CATEGORY'].apply(lambda x: split_category(x)))
train_df.drop(['CATEGORY'], axis = 1, inplace = True)

# Testing Dataset
test_df['Main_Category'], test_df['Sub_Category_1'], test_df['Sub_Category_2'] = \
zip(*test_df['CATEGORY'].apply(lambda x: split_category(x)))
test_df.drop(['CATEGORY'], axis = 1, inplace = True)

### Handling Negative Product Price

In [60]:
print('Removed {} rows' .format(len(train_df[train_df.PRODUCT_PRICE<=0])))
train_df = train_df[train_df.PRODUCT_PRICE > 0].reset_index(drop=True)

Removed 740 rows


### Encoding Brand and Category Columns

In [61]:
# Colums for text processing : [PRODUCT_BRAND, Main_Category, Sub_Category_1. Sub_Category_2]
vectorizer = CountVectorizer(lowercase=False, binary=True)


train_brand_oneHot = vectorizer.fit_transform(train_df['PRODUCT_BRAND'].values)
test_brand_oneHot = vectorizer.transform(test_df['PRODUCT_BRAND'].values)

train_main_cat_oneHot = vectorizer.fit_transform(train_df['Main_Category'].values)
test_main_cat_oneHot = vectorizer.transform(test_df['Main_Category'].values)

train_sub_cat_1_oneHot = vectorizer.fit_transform(train_df['Sub_Category_1'].values)
test_sub_cat_1_oneHot = vectorizer.transform(test_df['Sub_Category_1'].values)

train_sub_cat_2_oneHot = vectorizer.fit_transform(train_df['Sub_Category_2'].values)
test_sub_cat_2_oneHot = vectorizer.transform(test_df['Sub_Category_2'].values)

### Text Processing

#### Processing Category Column

In [62]:
def category_process(text):
  text = text.str.replace('[^a-zA-Z0-9]', ' ')
  text = text.str.replace(' ', '')
  text = text.str.replace('&', '_')
  return text

In [63]:
# Colums for category processing : [Main_Category, Sub_Category_1. Sub_Category_2]

# Training Dataset
train_df['Main_Category'] = category_process(train_df['Main_Category'])
train_df['Sub_Category_1'] = category_process(train_df['Sub_Category_1'])
train_df['Sub_Category_2'] = category_process(train_df['Sub_Category_2'])

# Testing Dataset
test_df['Main_Category'] = category_process(test_df['Main_Category'])
test_df['Sub_Category_1'] = category_process(test_df['Sub_Category_1'])
test_df['Sub_Category_2'] = category_process(test_df['Sub_Category_2'])


  text = text.str.replace('[^a-zA-Z0-9]', ' ')


#### Processing Name, Description Column

In [64]:
def text_process(text):
  text = text.str.replace('[^a-zA-Z0-9]', ' ')
  text = text.str.replace('\\"', ' ')
  text = text.str.replace('\\r', ' ')
  text = text.str.replace('\\n', ' ')
  stop_words = stopwords.words('english')
  text = text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
  text = text.map(lambda x: x.lower())
  return text

In [65]:
# Colums for text processing : [PRODUCT_DESCRIPTION, PRODUCT_NAME]

# Training Dataset
train_df['PRODUCT_NAME'] = text_process(train_df['PRODUCT_NAME'])
train_df['PRODUCT_DESCRIPTION'] = text_process(train_df['PRODUCT_DESCRIPTION'])

# Testing Dataset
test_df['PRODUCT_NAME'] = text_process(test_df['PRODUCT_NAME'])
test_df['PRODUCT_DESCRIPTION'] = text_process(test_df['PRODUCT_DESCRIPTION'])


  text = text.str.replace('[^a-zA-Z0-9]', ' ')
  text = text.str.replace('\\"', ' ')
  text = text.str.replace('\\r', ' ')
  text = text.str.replace('\\n', ' ')


In [66]:
train_df.head()
print(train_df.shape)

(1259414, 9)


In [67]:
test_df.head()
print(test_df.shape)

(222381, 8)


### Tokenization

In [68]:
# Tokenization needed for columns: [PRODUCT_DESCRIPTION, PRODUCT_NAME]

# Training Dataset
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=3, max_features= 70000)
train_name_tfidf = vectorizer.fit_transform(train_df['PRODUCT_NAME'].values)
test_name_tfidf = vectorizer.transform(test_df['PRODUCT_NAME'].values)

# Testing Dataset
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features= 200000)
train_description_tfidf = vectorizer.fit_transform(train_df['PRODUCT_DESCRIPTION'].values)
test_description_tfidf = vectorizer.transform(test_df['PRODUCT_DESCRIPTION'].values)

In [69]:
train_df.head()
print(train_df.shape)

(1259414, 9)


In [70]:
test_df.head()
print(test_df.shape)

(222381, 8)


## Sparse Matrix Creation

In [71]:
# Traning Dataset
Y_train = train_df['PRODUCT_PRICE'].to_numpy()
train_df.drop(['PRODUCT_PRICE'], axis = 1, inplace = True)
Y_train = Y_train.reshape((-1,1))
Y_Train = np.log1p(Y_train)      

train_sparse = hstack((train_brand_oneHot, train_main_cat_oneHot, train_sub_cat_1_oneHot, train_sub_cat_2_oneHot,
                       train_name_tfidf, train_description_tfidf)).tocsr()
new_train_df = train_df.drop(columns = ['PRODUCT_NAME', 'PRODUCT_BRAND','PRODUCT_DESCRIPTION','Main_Category','Sub_Category_1',
                                        'Sub_Category_2'], axis = 1, inplace = False)
X_train = hstack((new_train_df.values, train_sparse)).tocsr()

# Testing Dataset
test_sparse = hstack((test_brand_oneHot, test_main_cat_oneHot, test_sub_cat_1_oneHot, test_sub_cat_2_oneHot,
                       test_name_tfidf, test_description_tfidf)).tocsr()
new_test_df = test_df.drop(columns = ['PRODUCT_NAME', 'PRODUCT_BRAND','PRODUCT_DESCRIPTION','Main_Category','Sub_Category_1',
                                        'Sub_Category_2'], axis = 1, inplace = False)
X_test = hstack((new_test_df.values, test_sparse)).tocsr()

print(X_train.shape)
print(X_test.shape)


(1259414, 276163)
(222381, 276163)


## Splitting Training Dataset into : (Training and Validation)

In [72]:
training_X, validation_X, training_Y, validation_Y = train_test_split(X_train, Y_Train, test_size=0.2, random_state=17)


## Performance Metrics

In [73]:
def RMSLE(Y_True, Y_Prediction):
    assert len(Y_True) == len(Y_Prediction)
    score = np.sqrt(np.mean(np.power(np.log1p(Y_Prediction) - np.log1p(Y_True), 2)))
    return score

## Learning Model: Linear Regression 

In [74]:
svd = TruncatedSVD(n_components= 10, random_state=42)
svd.fit(X_train)
svd_X_train = svd.fit_transform(X_train)
svd_X_test = svd.transform(X_test)
X_train

<1259414x276163 sparse matrix of type '<class 'numpy.float64'>'
	with 55403957 stored elements in Compressed Sparse Row format>

In [75]:
training_X_svd, validation_X_svd, training_Y_svd, validation_Y_svd = train_test_split(svd_X_train, Y_Train, test_size=0.2, random_state=42)

In [76]:
model = LinearRegression().fit(training_X_svd, training_Y_svd)

valid_Y_pred = model.predict(validation_X_svd)
valid_Y_pred = np.expm1(valid_Y_pred)
valid_Y_true = np.expm1(validation_Y_svd)
print("Score on Validation Data: ", RMSLE(valid_Y_true, valid_Y_pred))

Score on Validation Data:  0.6937251906001094


## Learning Model: Random Forest Regressor

In [77]:
model = RandomForestRegressor(max_depth=2, random_state=0)
model.fit(training_X_svd[:25000], training_Y_svd[:25000])

valid_Y_pred = model.predict(validation_X_svd[:25000])
valid_Y_pred = np.expm1(valid_Y_pred[:25000])
valid_Y_true = np.expm1(validation_Y_svd[:25000])
print("Score on Validation Data: ", RMSLE(valid_Y_true, valid_Y_pred))

  model.fit(training_X_svd[:25000], training_Y_svd[:25000])


Score on Validation Data:  0.772981377982947


## Learning Model: RIDGE Regression (selecting best Alpha)

In [78]:
alpha = [1, 2, 3, 3.5, 4, 5] 
rmsle_array = [] 
for i in alpha:
    
    model = Ridge (solver="auto", random_state=42, alpha=i)
    model.fit(training_X, training_Y)
    
    validation_Y_pred = model.predict(validation_X)
    validation_Y_pred = np.expm1(validation_Y_pred)
    validation_Y_true = np.expm1(validation_Y)
    
    rmsle_array.append(RMSLE(validation_Y_true, validation_Y_pred))

for i in range(len(rmsle_array)):
    print ('RMSLE for alpha = ',alpha[i],'is',rmsle_array[i])


best_alpha = alpha[rmsle_array.index(min(rmsle_array))]
print('Best Alpha: ', alpha[rmsle_array.index(min(rmsle_array))])



RMSLE for alpha =  1 is 0.4516403195636112
RMSLE for alpha =  2 is 0.44775052890540845
RMSLE for alpha =  3 is 0.44678883380807893
RMSLE for alpha =  3.5 is 0.4467248999375962
RMSLE for alpha =  4 is 0.44685573773551496
RMSLE for alpha =  5 is 0.4472955001022651
Best Alpha:  3.5


In [79]:
alpha = [3.25, 3.30, 3.35, 3.45] 
rmsle_array = [] 
for i in alpha:
    
    model = Ridge (solver="auto", random_state=42, alpha=i)
    model.fit(training_X, training_Y)
    
    validation_Y_pred = model.predict(validation_X)
    validation_Y_pred = np.expm1(validation_Y_pred)
    validation_Y_true = np.expm1(validation_Y)
    
    rmsle_array.append(RMSLE(validation_Y_true, validation_Y_pred))

for i in range(len(rmsle_array)):
    print ('RMSLE for alpha = ',alpha[i],'is',rmsle_array[i])


best_alpha = alpha[rmsle_array.index(min(rmsle_array))]
print('Best Alpha: ', alpha[rmsle_array.index(min(rmsle_array))])

RMSLE for alpha =  3.25 is 0.4467329070474152
RMSLE for alpha =  3.3 is 0.44672518340958595
RMSLE for alpha =  3.35 is 0.4467410094560818
RMSLE for alpha =  3.45 is 0.44672542968910456
Best Alpha:  3.3


## Final Model

In [80]:
model = Ridge(solver="auto", random_state=42, alpha= best_alpha)
model = model.fit(X_train, Y_Train)

### Handling Negative Price Prediction 

In [81]:
Y_test = np.where(model.predict(X_test) < 0, 0, model.predict(X_test))
Y_test = np.expm1(Y_test)

In [82]:
valid_Y_pred = model.predict(validation_X)
valid_Y_pred = np.expm1(valid_Y_pred)
valid_Y_true = np.expm1(validation_Y)
print("Score on Validation Data: ", RMSLE(valid_Y_true, valid_Y_pred))

Score on Validation Data:  0.40792654741932827


### Submission .csv Creation

In [83]:
submission_df = pd.DataFrame().assign(PRODUCT_ID = test_df_ids, PRODUCT_PRICE= Y_test)
print(submission_df)
submission_df.to_csv('Submission.csv')


        PRODUCT_ID  PRODUCT_PRICE
0           777341       9.319494
1          1463629      25.483724
2           350669      12.915736
3           310222       5.334498
4           759257      28.090350
...            ...            ...
222376      491588      38.722458
222377     1410603      22.706917
222378      207926      10.072652
222379      241496      13.246577
222380      563849       7.726961

[222381 rows x 2 columns]
