In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s5e1/sample_submission.csv
/kaggle/input/playground-series-s5e1/train.csv
/kaggle/input/playground-series-s5e1/test.csv


In [2]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
train_data=pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
test_data=pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')
sample_submission=pd.read_csv('/kaggle/input/playground-series-s5e1/sample_submission.csv')

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+ MB


Checking null values

In [5]:
train_data.isnull().sum()

id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64

In [6]:
#filling in null values with median values
train_data['num_sold']=train_data['num_sold'].fillna(train_data['num_sold'].median())

In [7]:
train_data.isnull().sum()

id          0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

Converting date to datetime and extracting features

In [8]:
train_data['date']=pd.to_datetime(train_data['date'])
test_data['date']=pd.to_datetime(test_data['date'])
train_data['year']=train_data['date'].dt.year
train_data['month']=train_data['date'].dt.month
train_data['day']=train_data['date'].dt.day
test_data['year']=test_data['date'].dt.year
test_data['month']=test_data['date'].dt.month
test_data['day']=test_data['date'].dt.day

Encoding

In [9]:
categorical_columns=[feature for feature in train_data.columns if train_data[feature].dtype=='O']
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded_train = encoder.fit_transform(train_data[categorical_columns])
one_hot_df_train= pd.DataFrame(one_hot_encoded_train, columns=encoder.get_feature_names_out(categorical_columns))
train_data_encoded=pd.concat([train_data, one_hot_df_train],axis=1)
train_data_encoded=train_data_encoded.drop(categorical_columns, axis=1)

#encoding test data
one_hot_encoded_test=encoder.transform(test_data[categorical_columns])
one_hot_df_test=pd.DataFrame(one_hot_encoded_test, columns=encoder.get_feature_names_out(categorical_columns))
test_data_encoded=pd.concat([test_data,one_hot_df_test],axis=1)


In [10]:
y=train_data_encoded['num_sold']
train_data_encoded=train_data_encoded.drop(['id', 'date','num_sold'], axis=1)

In [11]:
X_train, X_test,y_train,y_test=train_test_split(train_data_encoded, y, test_size=0.2, random_state=42) 
#using linear regression
model=LinearRegression()
model.fit(X_train,y_train)


In [12]:
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")
print("Accuracy:", model.score(X_train,y_train))

Mean Absolute Error (MAE): 254.30
R² Score: 0.71
Accuracy: 0.7064532457172223


In [13]:
test_data_encoded=test_data_encoded.drop(categorical_columns,axis=1)
test_data_encoded=test_data_encoded.drop(['id','date'],axis=1)
y_predicted=model.predict(test_data_encoded)

In [14]:
sample_submission['num_sold']=y_predicted

In [15]:
sample_submission

Unnamed: 0,id,num_sold
0,230130,-9.442787
1,230131,946.392685
2,230132,732.633345
3,230133,271.649939
4,230134,365.715194
...,...,...
98545,328675,531.457882
98546,328676,1487.293354
98547,328677,1273.534013
98548,328678,812.550608


In [16]:
sample_submission.to_csv("submission.csv",index=False)