In [91]:
import numpy as np
import pandas as pd 
# Load the raw training table; the path is resolved relative to the
# notebook's current working directory.
df = pd.read_csv('train_data.csv')

In [92]:
# Separate the regression target from the feature columns.
Y = df.loc[:, 'purchaseValue']
X = df.drop(columns='purchaseValue')

In [93]:
# Sentinel strings that stand in for a missing value. They are converted to
# NaN in a single pass (one frame copy instead of four) so that the null
# checks and imputers further down treat them as missing.
MISSING_PLACEHOLDERS = [
    '(not set)',
    '(not available)',
    'not available in demo dataset',
    '(not provided)',
]
X = X.replace(MISSING_PLACEHOLDERS, np.nan)

In [94]:
# Per-column fraction of missing entries; any column that is more than 70%
# missing is removed from the feature matrix.
missing_percent = X.isna().mean()
cols_to_drop = missing_percent.loc[missing_percent.gt(0.7)].index
X = X.drop(columns=cols_to_drop)

In [95]:
# Count missing values in the target column.
df['purchaseValue'].isna().sum()

0

In [96]:
# Number of feature columns left after removing the mostly-missing ones.
print(X.shape[1])

30


In [97]:
# Drop the identifier columns so they are not used as features.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op instead of raising
# KeyError because the columns are already gone.
X = X.drop(columns=['userId', 'sessionId'], errors='ignore')

In [98]:
# Number of feature columns left once the identifier columns are removed.
print(X.shape[1])

28


In [99]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [100]:
# Number of rows in the training split.
print(X_train.shape[0])

92818


In [101]:
# Number of rows in the held-out test split.
print(X_test.shape[0])

23205


In [102]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Route columns by dtype. int64/float64 columns go through the numeric
# branch; object/category/bool columns go through the categorical branch.
cat_cols = X_train.select_dtypes(include=['object', 'category', 'bool']).columns
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer code. Categories that were not present at fit time
# are encoded as -1 instead of raising.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

# Apply each branch to its own column subset; numeric outputs come first.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

In [103]:
# Replace every test-set category that never occurs in the training split
# (including missing values) with that column's most frequent training value.
#
# The test frame comes out of train_test_split, so take an explicit copy
# before assigning columns; this avoids writing into a possible view of X
# (SettingWithCopyWarning).
X_test = X_test.copy()
for col in cat_cols:
    train_values = X_train[col].dropna()
    if train_values.empty:
        # No observed training category to fall back on; leave the column
        # untouched rather than failing on mode()[0].
        continue
    most_freq = train_values.mode()[0]
    # Vectorised membership test instead of a per-element Python lambda.
    is_known = X_test[col].isin(train_values)
    X_test[col] = X_test[col].where(is_known, most_freq)

In [104]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformers to the test split.
# NOTE: both names are rebound from the input frames to the transformed
# output, so re-running this cell (or any earlier cell that selects columns
# by name) requires re-running the train/test split first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [105]:
# Number of columns produced by the preprocessor for the training split.
X_train.shape[1]

28

In [106]:
# Sanity check on the scaler: the first transformed column should be centred.
print(X_train[:, 0].mean())

5.695487894648824e-17


In [107]:
# Sanity check on the scaler: the first transformed column should have unit spread.
print(X_train[:, 0].std())

1.0


In [108]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score

# Candidate regressors. The tree ensembles are seeded so results repeat.
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

# Fit each model on the training split and print its R^2 on the held-out
# split, in the order: linear regression, random forest, gradient boosting.
for model in (lr, rf, gbr):
    model.fit(X_train, Y_train)
    print(r2_score(Y_test, model.predict(X_test)))

0.12691283261415776
0.13112522094311274
-0.0877873765474888


In [111]:
from sklearn.metrics import root_mean_squared_error
# Root-mean-squared error of the random forest on the held-out split,
# expressed in the same units as purchaseValue.
print(root_mean_squared_error(Y_test,rf.predict(X_test)))

202658397.85060716


In [112]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the random forest on the held-out split,
# expressed in the same units as purchaseValue.
print(mean_absolute_error(Y_test,rf.predict(X_test)))

26153940.310277957
