## Settings

In [None]:
# Mount Google Drive into the Colab runtime; the dataset paths used by this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Project root on the mounted Drive (note the trailing slash); both CSV files
# sit in its data/ subfolder.
FOLDER_PATH = r'/content/drive/MyDrive/MachineLearning-DANDL-20212/'
TRAIN_DATA_PATH = f'{FOLDER_PATH}data/train.csv'
TEST_DATA_PATH = f'{FOLDER_PATH}data/test.csv'

## Data Processing

In [None]:
# Confirm the project folder and both CSV files are reachable before loading.
tuple(os.path.exists(path) for path in (FOLDER_PATH, TRAIN_DATA_PATH, TEST_DATA_PATH))


(True, True, True)

In [None]:
# Load the training set and preview its first rows.
df_train = pd.read_csv(TRAIN_DATA_PATH)
df_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [None]:
# Summary statistics for the numeric columns of the training set.
df_train.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [None]:
# Column dtypes and non-null counts of the training set (shows which columns
# contain missing values).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [None]:
# Load the test set and preview its first rows.
df_test = pd.read_csv(TEST_DATA_PATH)
df_test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [None]:
# Column dtypes and non-null counts of the test set.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     233599 non-null  int64  
 1   Product_ID                  233599 non-null  object 
 2   Gender                      233599 non-null  object 
 3   Age                         233599 non-null  object 
 4   Occupation                  233599 non-null  int64  
 5   City_Category               233599 non-null  object 
 6   Stay_In_Current_City_Years  233599 non-null  object 
 7   Marital_Status              233599 non-null  int64  
 8   Product_Category_1          233599 non-null  int64  
 9   Product_Category_2          161255 non-null  float64
 10  Product_Category_3          71037 non-null   float64
dtypes: float64(2), int64(4), object(5)
memory usage: 19.6+ MB


In [None]:
# Columns kept as model inputs: the user/product identifiers plus the
# demographic fields. The Product_Category_* columns and the Purchase target
# are deliberately left out of the feature frames.
FEATURE_COLUMNS = [
    'User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation',
    'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status',
]

# Select from a single shared list so the train and test frames cannot drift
# apart in column set or order.
df_train_clean = df_train[FEATURE_COLUMNS]
df_test_clean = df_test[FEATURE_COLUMNS]
df_train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   User_ID                     550068 non-null  int64 
 1   Product_ID                  550068 non-null  object
 2   Gender                      550068 non-null  object
 3   Age                         550068 non-null  object
 4   Occupation                  550068 non-null  int64 
 5   City_Category               550068 non-null  object
 6   Stay_In_Current_City_Years  550068 non-null  object
 7   Marital_Status              550068 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 33.6+ MB


In [None]:
# Show both shapes as one tuple: a notebook cell only displays its final
# expression, so putting them on separate lines silently hides the first one.
df_train_clean.shape, df_test_clean.shape

(550068, 8)

In [None]:
# Number of training rows; marks where train ends once train and test are
# stacked into a single frame.
check_point = len(df_train)
check_point

550068

In [None]:
# Regression target: the Purchase column of the training set.
Y_train = df_train['Purchase']
Y_train.shape

(550068,)

In [None]:
# Stack train on top of test so both receive identical encodings below.
# ignore_index is not passed, so each frame keeps its original row labels.
df = pd.concat([df_train_clean, df_test_clean])

In [None]:
# Map each distinct Product_ID string to an integer code, then shift by one so
# the codes are 1-based instead of 0-based.
product_codes = pd.factorize(df['Product_ID'])[0]
arr = product_codes + 1

In [None]:
# Replace the string product identifiers with their integer codes.
# NOTE(review): these codes are arbitrary labels, yet Product_ID is not one-hot
# encoded later and reaches the linear model as a numeric magnitude — confirm
# this is intended.
df['Product_ID'] = arr

In [None]:
# Verify the combined frame: Product_ID should now be an integer column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   User_ID                     783667 non-null  int64 
 1   Product_ID                  783667 non-null  int64 
 2   Gender                      783667 non-null  object
 3   Age                         783667 non-null  object
 4   Occupation                  783667 non-null  int64 
 5   City_Category               783667 non-null  object
 6   Stay_In_Current_City_Years  783667 non-null  object
 7   Marital_Status              783667 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 53.8+ MB


In [None]:
# One-hot encode the categorical fields; User_ID and Product_ID are not listed
# and therefore stay as single integer columns.
categorical_columns = [
    'Gender', 'Age', 'Occupation',
    'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status',
]
df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Shape of the combined frame after one-hot encoding.
df.shape

(783667, 42)

In [None]:
# Undo the stacking by position: the first check_point rows are the training
# set, everything after them is the test set.
X_train = df.iloc[:check_point]
X_test = df.iloc[check_point:]

In [None]:
# The row count here should equal check_point.
X_train.shape

(550068, 42)

In [None]:
# Hold out 30% of the training rows for validation; the fixed random_state
# makes the split repeatable across runs.
X_train_1, X_val, y_train_1, y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Sizes of the fitting split and the held-out validation split.
X_train_1.shape, X_val.shape

((385047, 42), (165021, 42))

## Modelling

In [None]:
# Linear regression estimator, constructed with its default settings.
model = LinearRegression()

In [None]:
# Fit on the training split only, so the validation split stays unseen.
model.fit(X_train_1, y_train_1)

LinearRegression()

In [None]:
# Predictions for the held-out validation rows.
y_result = model.predict(X_val)

In [None]:
# Convert the validation target to a plain NumPy array for the metric below.
# np.asarray leaves an existing ndarray untouched, so this cell can be re-run
# safely; `y_val.values` raised AttributeError on the second run because the
# name had already been rebound to an ndarray.
y_val = np.asarray(y_val)

In [None]:
# Root-mean-squared error of the validation predictions.
squared_errors = (y_result - y_val) ** 2
np.sqrt(1 / y_val.shape[0] * np.sum(squared_errors))

4851.161299511271

In [None]:
# Refit the same estimator object on the full training set before predicting
# the test set. This overwrites the fit from the training split, so the
# validation cells above must be re-run from the first fit to be meaningful.
model.fit(X_train, Y_train)

LinearRegression()

In [None]:
# Predicted Purchase values for the test rows, in X_test row order.
result = model.predict(X_test)

## Submission

In [None]:
# Place the predictions next to the identifying columns of the test set.
# The two frames are joined column-wise on their row labels.
predicted_purchase = pd.DataFrame(result, columns=['Purchase'])
test_identifiers = df_test[['User_ID', 'Product_ID']]
submission = pd.concat([predicted_purchase, test_identifiers], axis=1)
submission.head()

Unnamed: 0,Purchase,User_ID,Product_ID
0,9692.8883,1000004,P00128942
1,9118.139251,1000009,P00113442
2,8978.821074,1000010,P00288442
3,4813.98354,1000010,P00145342
4,9217.195681,1000011,P00053842


In [None]:
# Write the predictions to submission.csv (relative path, so it lands in the
# current working directory) without the row index.
submission.to_csv('submission.csv', index=False)