In [1]:
import pandas as pd
import numpy as np


In [2]:
train_data=pd.read_csv('/content/train[1].csv')
test_data=pd.read_csv('/content/test[1].csv')

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [4]:
train_data.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [5]:
train_data.head(5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


## Check For Missing Values

In [6]:
train_data.isnull().sum()

Unnamed: 0,0
User_ID,0
Product_ID,0
Gender,0
Age,0
Occupation,0
City_Category,0
Stay_In_Current_City_Years,0
Marital_Status,0
Product_Category_1,0
Product_Category_2,173638


In [7]:
train_data.fillna(0,inplace=True)

## Feature Engineering

In [8]:
train_data = pd.get_dummies(train_data, columns=['Gender', 'Age', 'City_Category'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Gender', 'Age', 'City_Category'], drop_first=True)


## Featutre Selection

In [9]:
X_train = train_data.drop(columns=['User_ID', 'Product_ID', 'Purchase'])
y_train = train_data['Purchase']


## Model Selection and Training

In [10]:
print(X_train.dtypes)


Occupation                      int64
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Gender_M                         bool
Age_18-25                        bool
Age_26-35                        bool
Age_36-45                        bool
Age_46-50                        bool
Age_51-55                        bool
Age_55+                          bool
City_Category_B                  bool
City_Category_C                  bool
dtype: object


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer



In [12]:
print(train_data['Stay_In_Current_City_Years'].unique())


['2' '4+' '3' '1' '0']


In [13]:
# Convert '4+' to '4' and then to integer
train_data['Stay_In_Current_City_Years'] = train_data['Stay_In_Current_City_Years'].replace('4+', '4').astype(int)
test_data['Stay_In_Current_City_Years'] = test_data['Stay_In_Current_City_Years'].replace('4+', '4').astype(int)


In [14]:
print(X_train.dtypes)


Occupation                      int64
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Gender_M                         bool
Age_18-25                        bool
Age_26-35                        bool
Age_36-45                        bool
Age_46-50                        bool
Age_51-55                        bool
Age_55+                          bool
City_Category_B                  bool
City_Category_C                  bool
dtype: object


In [15]:
X_train = pd.get_dummies(train_data.drop(columns=['User_ID', 'Product_ID', 'Purchase']), drop_first=True)
y_train = train_data['Purchase']

X_test = pd.get_dummies(test_data.drop(columns=['User_ID', 'Product_ID']), drop_first=True)



In [16]:
# Align columns
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


In [17]:
# Split the data again if needed
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize and fit the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)


In [18]:
y_val_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'Validation RMSE: {rmse}')


Validation RMSE: 3060.3213439392616


In [19]:
model.fit(X_train, y_train)


In [20]:
imputer = SimpleImputer(strategy='mean')
X_test_imputed = imputer.fit_transform(X_test)
test_predictions = model.predict(X_test_imputed)



In [21]:
submission = pd.DataFrame({'Purchase': test_predictions,'User_ID': test_data['User_ID'],'Product_ID' : test_data['Product_ID']})
submission.to_csv('SampleSubmission.csv', index=False)


In [22]:
saved_submission = pd.read_csv('SampleSubmission.csv')
print(saved_submission.head())

       Purchase  User_ID Product_ID
0  16747.550833  1000004  P00128942
1  10643.513833  1000009  P00113442
2   8084.323337  1000010  P00288442
3   2257.897333  1000010  P00145342
4   2654.679512  1000011  P00053842


In [23]:
try:
    submission.to_csv(r"C:\Users\This PC\Documents\SampleSubmission.csv", index=False)
    print("File saved successfully!")
except Exception as e:
    print(f"Error saving file: {e}")


File saved successfully!


In [24]:
submission.to_csv(r"C:\Temp\SampleSubmission.csv", index=False)
