In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [54]:
# Load the train and test CSVs into separate frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sa2022/train.csv',
        '/kaggle/input/sa2022/test.csv',
    )
)

In [3]:
# Preview the first rows of the training frame.
df_train.head()

In [4]:
# Show the training rows whose maincateg is missing.
df_train.loc[df_train['maincateg'].isna()]

In [5]:
# Preview the first rows of the test frame.
df_test.head()

### Columns
* `title` - Name of the product
* `Rating` - average rating given to a product
* `maincateg` - category that the product is listed under (men/women)
* `platform` - platform on which it is sold (e.g. Amazon, Flipkart)
* `price1` - Discounted Price of the listed product
* `actprice1` - Actual price of the listed product
* `Offer %` - Discount percent
* `norating1` - number of ratings available for a particular product
* `noreviews1` - number of reviews available for a particular product
* `star_5f` - number of five star ratings given to a particular product
* `star_4f` - number of four star ratings given to a particular product
* `star_3f` - number of three star ratings given to a particular product
* `star_2f` - number of two star ratings given to a particular product
* `star_1f` - number of one star ratings given to a particular product
* `fulfilled1` - whether it is Amazon fulfilled or not

### Keep in mind --- test data doesn't contain the `price1` and `Offer %` columns, as the target is predicting the price of the commodity

In [6]:
# Report (rows, columns) for the train frame, then the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

In [7]:
# Display the test frame.
df_test

In [8]:
# Column dtypes of the training frame.
df_train.dtypes

In [9]:
# Column dtypes of the test frame.
df_test.dtypes

In [10]:
# Number of missing values per column in the training frame.
df_train.isna().sum()

In [11]:
# Number of missing values per column in the test frame.
df_test.isna().sum()

In [12]:
import math
def fill_norating1(df):
    """Fill missing ``norating1`` values in place with the sum of the star counts.

    For every row whose ``norating1`` is NaN, the value is replaced by
    ``star_1f + star_2f + star_3f + star_4f + star_5f`` of that row. Rows that
    already have a value are left untouched. If any star count of a row is
    itself NaN, the sum (and therefore the filled value) stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``norating1`` and ``star_1f`` .. ``star_5f``.
        It is modified in place.

    Returns
    -------
    None
    """
    missing = df['norating1'].isna()
    if not missing.any():
        return

    star_total = (
        df['star_1f'] + df['star_2f'] + df['star_3f'] + df['star_4f'] + df['star_5f']
    )
    # A single .loc write on the frame itself: chained indexing such as
    # df['norating1'][ind] = ... may assign into a temporary copy and is a
    # no-op under pandas Copy-on-Write.
    df.loc[missing, 'norating1'] = star_total[missing].to_numpy()

In [13]:
def _categ_from_title(title):
    """Return 'Women', 'Men' or 'Unisex' based on the words found in ``title``."""
    if isinstance(title, str):
        # "Women" is tested first so that it always wins when both words appear.
        if "Women" in title:
            return 'Women'
        if "Men" in title:
            return 'Men'
    # Titles that are missing (NaN) or mention neither word fall back to Unisex.
    return 'Unisex'


def fill_maincateg(df):
    """Fill missing ``maincateg`` values in place using the product title.

    A row with a missing ``maincateg`` gets ``'Women'`` if its ``title``
    contains "Women", otherwise ``'Men'`` if it contains "Men", otherwise
    ``'Unisex'``. Rows that already have a category are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``maincateg`` and ``title``. It is modified
        in place.

    Returns
    -------
    None
    """
    missing = df['maincateg'].isna()
    if not missing.any():
        return

    inferred = [_categ_from_title(title) for title in df.loc[missing, 'title']]
    # A single .loc write on the frame itself: chained indexing such as
    # df['maincateg'][ind] = ... may assign into a temporary copy and is a
    # no-op under pandas Copy-on-Write.
    df.loc[missing, 'maincateg'] = inferred

In [55]:
# filling the nan values in df_train
# Each filled column is assigned back to the frame. Calling
# fillna(..., inplace=True) on df.<col> acts on a temporary Series and does
# not update the frame under pandas Copy-on-Write.
for col in ['star_5f', 'star_4f', 'star_3f']:
    df_train[col] = df_train[col].fillna(0)
# maincateg is inferred from the title; norating1 from the star counts
# (filled just above, so the sum is defined).
fill_maincateg(df_train)
fill_norating1(df_train)
df_train['noreviews1'] = df_train['noreviews1'].fillna(0)

# filling the nan values in df_test
for col in ['star_5f', 'star_1f']:
    df_test[col] = df_test[col].fillna(0)
df_test['Rating'] = df_test['Rating'].fillna(df_test['Rating'].mean())
# missing test categories get the most frequent training category
df_test['maincateg'] = df_test['maincateg'].fillna(df_train['maincateg'].mode()[0])

In [56]:
# Per-column missing-value counts: train first, then test.
for frame in (df_train, df_test):
    print(frame.isna().sum())

In [16]:
# Frequency of each maincateg value in the training frame.
df_train.maincateg.value_counts()

In [57]:
# Cast the count columns to int64, one column at a time.
for col in ('star_5f', 'star_4f', 'star_3f', 'norating1', 'noreviews1'):
    df_train[col] = df_train[col].astype('int64')

for col in ('star_5f', 'star_1f'):
    df_test[col] = df_test[col].astype('int64')

# Show the resulting dtypes of both frames.
print('Train :\n', df_train.dtypes, sep='')
print('\nTest :\n', df_test.dtypes, sep='')

In [58]:
# converting the offer_% column to float
# astype(str) first keeps this cell re-runnable: once the column is float the
# .str accessor would otherwise raise. regex=False makes the literal '%'
# replacement explicit instead of relying on the version-dependent default.
df_train['Offer %'] = (
    df_train['Offer %']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype('float64')
)

In [77]:
# feature columns: everything in df_train except 'price1' and 'Offer %'
non_target_cols = [
    name
    for name in df_train.columns
    if name not in ('price1', 'Offer %')
]
non_target_cols

In [78]:
# Feature matrix and target vector taken from the training frame.
X = df_train.loc[:, non_target_cols]
y = df_train.loc[:, 'price1']

In [88]:
# one hot encoding the data
X_encoded = pd.get_dummies(X, columns=['maincateg', 'platform'])
# NOTE(review): presumably dropped because the test frame has no 'Unisex'
# rows -- confirm. errors='ignore' keeps the cell working when the training
# data has no 'Unisex' row either.
X_encoded = X_encoded.drop(columns=['maincateg_Unisex'], errors='ignore')
df_test_encoded = pd.get_dummies(df_test, columns=['maincateg', 'platform'])
# Give the test matrix exactly the training columns, in the same order.
# Dummy columns for categories seen only in training are added as all-zero;
# columns seen only in test are discarded, since the model cannot use them.
df_test_encoded = df_test_encoded.reindex(columns=X_encoded.columns, fill_value=0)

In [89]:
# Show one randomly chosen row of the encoded test frame.
df_test_encoded.sample()

In [90]:
# extracting the ids for submitting
# Read straight from df_test: the previous version referenced `df_ids`, a name
# that is never defined, so it raised NameError on a fresh kernel.
df_test_ids = df_test['id'].to_numpy()

In [91]:
# Remove the identifier and free-text columns from both encoded frames, in place.
for encoded in (X_encoded, df_test_encoded):
    encoded.drop(columns=['id', 'title'], inplace=True)

In [92]:
# Show one randomly chosen row of the encoded test frame.
df_test_encoded.sample()

In [93]:
# scaling the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the per-column mean and standard deviation from the training features.
X_scaled = pd.DataFrame(scaler.fit_transform(X_encoded), columns=X_encoded.columns)
# Apply those same training statistics to the test features. Calling
# fit_transform here would re-fit the scaler on the test set, putting train
# and test on different scales for the model.
df_test_scaled = pd.DataFrame(scaler.transform(df_test_encoded), columns=df_test_encoded.columns)

In [94]:
# Preview the first rows of the scaled test frame.
df_test_scaled.head()

In [95]:
# splitting the data into train and validation
from sklearn.model_selection import train_test_split
# A fixed random_state makes the 85/15 split, and therefore the validation
# score, identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.15, random_state=42
)

In [96]:
# Report (rows, columns) for the training part, then the validation part.
for part in (X_train, X_val):
    print(part.shape)

In [97]:
# Baseline: plain linear regression, scored on the validation split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lr_model = LinearRegression().fit(X_train, y_train)
lr_preds = lr_model.predict(X_val)
lr_mse = mean_squared_error(y_val, lr_preds)
# Square root of the MSE, i.e. the RMSE.
print(lr_mse ** 0.5)

In [98]:
# submitting the linear regression model's output
# Refit on the full labelled data (train + validation) before predicting the
# test set. The previous version refit on X_val/y_val only, i.e. on the 15%
# validation split, discarding most of the training data.
lr_model.fit(X_scaled, y)
lr_preds_final = lr_model.predict(df_test_scaled)

In [117]:
# Pair each test id with its predicted price and index the table by id.
lr_sub = pd.DataFrame(
    {'id': df_test_ids, 'price1': lr_preds_final}
).set_index('id')
lr_sub

In [105]:
# Write the submission frame to submission.csv in the current working directory.
lr_sub.to_csv('submission.csv')