# Multiple Linear Regression

## 1. Import Libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

## 2. Import Data

In [None]:
# Load the sales data from CSV into a DataFrame
# (the file name suggests it still contains NaNs at this point)
df_start = pd.read_csv(filepath_or_buffer='Sales_with_NaNs_v1.3.csv')
# Preview the first five rows as a quick sanity check of the load
df_start.head(n=5)

# 3. Analyze Data
## Handling Missing Values
The dataset contains missing values. Because the linear regression model cannot be fitted on data that contains NaNs, we remove every row that has at least one missing value.

In [None]:
# Remove, in place, every row that has at least one missing value
df_start.dropna(axis=0, how='any', inplace=True)

## Describe
We generate descriptive statistics to understand the central tendency, dispersion, and shape of each numeric column's distribution.

In [None]:
# Summary statistics per column: count, mean, std, min, quartiles and max
df_start.describe(percentiles=[0.25, 0.5, 0.75])

## Distribution
A distribution plot is created for 'Sales_After' to visualize how the data is distributed.

In [None]:
# Data distribution
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal.
# histplot with kde=True and stat='density' draws the equivalent
# density-normalised histogram with a KDE curve on top.
plt.title('Sales After Distribution Plot')
sns.histplot(df_start['Sales_After'], kde=True, stat='density', kde_kws={'cut': 3})
plt.show()

## Relationship between Sales Before and Sales After
A scatter plot is used to visualize the relationship between 'Sales_Before' and 'Sales_After'.

In [None]:
# Scatter plot: Sales Before on the x-axis against Sales After on the y-axis
plt.title('Sales After vs Sales Before')
plt.xlabel('Sales Before')
plt.ylabel('Sales After')
plt.scatter(
    x=df_start['Sales_Before'],
    y=df_start['Sales_After'],
    color='lightcoral',
)
# Hide the frame around the axes for a cleaner look
plt.box(on=False)
plt.show()

## 4. Split into Independent/Dependent variables
The dataset is split into independent variables (X) and the dependent variable (y), which is 'Sales_After'.

In [None]:
# Dependent variable (target): the column we want to predict
y = df_start['Sales_After']
# Independent variables (features): every remaining column
X = df_start.drop(columns=['Sales_After'])

## 5. One-hot encoding
Categorical variables are converted into a numerical format using one-hot encoding.

In [None]:
# One-hot encoding of categorical data
# Columns with dtype 'object' are treated as categorical; every other column
# is passed through unchanged (remainder='passthrough').
categorical_features = X.select_dtypes(include=['object']).columns
# sparse_threshold=0 forces a dense result. With the default (0.3) the
# transformer may return a SciPy sparse matrix when the encoded data is mostly
# zeros, and np.array() would wrap that in a 0-d object array, which breaks
# the later train/test split.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

## 6. Split into Train/Test sets
The dataset is divided into training and testing sets to evaluate the model's performance on unseen data.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## 7. Train model
A multiple linear regression model is trained on the training data.

In [None]:
# Fit an ordinary least squares model (with intercept) on the training split
regressor = LinearRegression(fit_intercept=True)
regressor.fit(X=X_train, y=y_train)

## 8. Predict results
The trained model is used to make predictions on the test set.

In [None]:
# Predict the target for the held-out test rows
y_pred = regressor.predict(X=X_test)

## 9. Compare predictions
Finally, the predicted 'Sales_After' values are compared with the actual values from the test set.

In [None]:
# Print predictions next to the actual values: column 0 = predicted, column 1 = actual
np.set_printoptions(precision=2)
result = np.column_stack((y_pred, y_test.values))
print(result)