In [44]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline 

# --- Load the raw tables (paths are relative to the notebook's working dir) ---
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
holiday = pd.read_csv("holidays_events.csv")
stores = pd.read_csv("stores.csv")
oil = pd.read_csv("oil.csv")
transactions = pd.read_csv("transactions.csv")

# Stack train and test so every derived feature is built identically for both.
data = pd.concat([train, test], ignore_index=True)

# The holiday table is joined on `date` alone. If a date appears more than once
# there, a left merge would emit one output row per match and silently
# duplicate sales rows. Keep a single (first-listed) entry per date so the
# merge is strictly many-to-one.
# NOTE(review): this discards any additional same-day holiday entries.
holiday_by_date = holiday.drop_duplicates(subset="date", keep="first")

data = data.merge(
    holiday_by_date, how="left", on="date", validate="many_to_one"
).rename(columns={"type": "holiday_type"})
# stores.csv also has a `type` column; it is exposed as `city_type` downstream.
data = data.merge(stores, how="left", on="store_nbr").rename(
    columns={"type": "city_type"}
)
data = data.merge(transactions, how="left", on=["store_nbr", "date"])
data = data.merge(oil, how="left", on="date")

# Left merges must never add rows; fail loudly if a lookup table has
# duplicate keys instead of quietly inflating the dataset.
assert len(data) == len(train) + len(test), "a merge duplicated rows"

print(data.columns)

# --- Calendar features ---
data["date"] = pd.to_datetime(data["date"])
data["year"] = data["date"].dt.year
data["month"] = data["date"].dt.month
data["week"] = data["date"].dt.isocalendar().week
data["quarter"] = data["date"].dt.quarter
data["day_of_week"] = data["date"].dt.day_name()

# --- Missing values ---
# Back-fill oil prices in row order (i.e. take the next available price), then
# forward-fill so a trailing gap at the end of the frame does not stay NaN.
# `fillna(method=...)` is deprecated in recent pandas; `.bfill()` is equivalent.
data["dcoilwtico"] = data["dcoilwtico"].bfill().ffill()
# Rows without a match in the transactions table get 0.
data["transactions"] = data["transactions"].fillna(0)
# Dates with no holiday entry are labelled as ordinary working days.
data["holiday_type"] = data["holiday_type"].fillna("Working Day")
# Go through the nullable boolean dtype so the result is a plain bool column
# regardless of whether the merge produced an object column.
data["transferred"] = (
    data["transferred"].astype("boolean").fillna(False).astype(bool)
)




Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion',
       'holiday_type', 'locale', 'locale_name', 'description', 'transferred',
       'city', 'state', 'city_type', 'cluster', 'transactions', 'dcoilwtico'],
      dtype='object')


In [None]:
# Line chart of `sales` against `date`, drawn on an explicit Axes object.
fig, ax = plt.subplots()
sns.lineplot(data=data, x='date', y='sales', ax=ax)
ax.set_xlabel('Date')
ax.set_ylabel('Sales')
ax.set_title('Sales Over Time')
plt.show()

In [None]:
# The oil price is joined onto `data` by date, so every row sharing a date
# carries the same value. Plot one marker per date instead of one per row:
# the picture is the same, without drawing piles of overlapping points.
oil_by_date = data.drop_duplicates(subset='date')

fig, ax = plt.subplots()
ax.scatter(oil_by_date['date'], oil_by_date['dcoilwtico'])
ax.set_xlabel('Date')
ax.set_ylabel('Oil price (dcoilwtico)')
ax.set_title('Oil Price Over Time')
plt.show()

In [None]:
# Distinct values of the `holiday_type` column, in order of first appearance.
pd.unique(data['holiday_type'])

In [None]:
# No-op when `date` is already datetime; keeps this cell runnable on its own.
data['date'] = pd.to_datetime(data['date'])


def _oil_tendency(prices):
    """Label a month's oil prices by the sign of their mean row-to-row change.

    Returns 'Increasing' when the mean difference is positive and 'Decreasing'
    otherwise (which therefore also covers a flat or undefined change).
    """
    return 'Increasing' if prices.diff().mean() > 0 else 'Decreasing'


# Aggregate per calendar month: total sales plus the oil-price tendency label.
monthly_data = (
    data.groupby(data['date'].dt.to_period('M'))
    .agg({'sales': 'sum', 'dcoilwtico': _oil_tendency})
    .reset_index()
)
monthly_data['Month'] = monthly_data['date'].dt.strftime('%Y-%m')
# A real timestamp per month gives a proper time axis instead of one
# categorical tick per 'YYYY-MM' string.
monthly_data['month_start'] = monthly_data['date'].dt.to_timestamp()
monthly_data = monthly_data.drop('date', axis=1)

# Draw ONE continuous sales line and colour only the markers by tendency.
# Passing `hue` to the line itself would split it into two separate lines,
# each joining months that are not adjacent in time.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(x='month_start', y='sales', data=monthly_data, color='0.6', ax=ax)
sns.scatterplot(
    x='month_start', y='sales', hue='dcoilwtico', data=monthly_data,
    ax=ax, zorder=3,
)
ax.set_xlabel('Month')
ax.set_ylabel('Sales')
ax.set_title('Sales Over Time (Color Coded by Monthly Oil Price Tendency)')
ax.legend(title='Oil Price Tendency')
fig.autofmt_xdate()
plt.show()

In [52]:
from sklearn.preprocessing import StandardScaler

# Columns carried into the model matrix.
feature_columns = [
    'store_nbr', 'family', 'sales', 'onpromotion',
    'holiday_type', 'locale', 'locale_name', 'description', 'transferred',
    'city', 'state', 'city_type', 'cluster', 'transactions', 'dcoilwtico',
]

# Columns to standardize. The target `sales` is deliberately NOT in this list:
# scaling it here would make anything fitted on it predict z-scores rather
# than sales, with no inverse transform kept around to undo it.
scaled_columns = [
    'store_nbr', 'onpromotion', 'transferred', 'cluster',
    'transactions', 'dcoilwtico',
]

# Cast `transferred` to a real bool first: if it arrives as an object column,
# get_dummies would expand it into indicator columns and the scaling step
# below could no longer find a column named 'transferred'.
# Assumes missing values in `transferred` were already filled upstream.
dummies = pd.get_dummies(data[feature_columns].astype({'transferred': bool}))

scaler = StandardScaler()
dummies[scaled_columns] = scaler.fit_transform(dummies[scaled_columns])

In [64]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Rows up to and including this date are used for fitting; later rows are the
# ones to predict.
TRAIN_END_DATE = '2017-08-15'
numeric_features = ['store_nbr', 'dcoilwtico', 'transactions']

is_train = data['date'] <= TRAIN_END_DATE
is_test = data['date'] > TRAIN_END_DATE

filtered_train_data = dummies[is_train]
filtered_test_data = dummies[is_test]

# NOTE(review): missing `transactions` values were filled with 0 upstream;
# confirm the feature is really available for the prediction period.
x_train = filtered_train_data[numeric_features]
x_test = filtered_test_data[numeric_features]
# Take the target from `data` so the model is fitted on sales in their
# original units, independent of any scaling applied to `dummies`.
y_train = data.loc[is_train, 'sales']

# One-hot encode `holiday_type`. It is read from `data` because get_dummies
# has already replaced that column with indicator columns in `dummies`.
# The encoder's default sparse output is densified with .toarray(), which
# avoids the `sparse=` / `sparse_output=` keyword that differs across
# scikit-learn versions.
encoder = OneHotEncoder(drop='first')
holiday_type_encoded = encoder.fit_transform(
    data.loc[is_train, ['holiday_type']]
).toarray()
x_train = np.hstack((x_train.values, holiday_type_encoded))

holiday_type_encoded = encoder.transform(
    data.loc[is_test, ['holiday_type']]
).toarray()
x_test = np.hstack((x_test.values, holiday_type_encoded))

# Use a dedicated scaler (the shared `scaler` keeps its earlier fit). Fit on
# the training matrix only and reuse those statistics for the test matrix, so
# both sets go through the same transformation.
feature_scaler = StandardScaler()
scaled_X_train = feature_scaler.fit_transform(x_train)
scaled_X_test = feature_scaler.transform(x_test)

# Fit the model and predict sales; a linear model can go below zero, which is
# not a meaningful sales value, so clip at 0.
model = LinearRegression()
model.fit(scaled_X_train, y_train)
predicted_sales = np.clip(model.predict(scaled_X_test), 0, None)

# Build the submission from the real `id` column, not the positional index.
submission = pd.DataFrame({
    'id': data.loc[is_test, 'id'].to_numpy(),
    'sales': predicted_sales,
})
# Guard against an upstream merge having duplicated rows: one row per id.
submission = submission.drop_duplicates(subset='id')
submission.to_csv('submission.csv', index=False)

