# Simple Linear Regression

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## 2. Load the Dataset

In [None]:
# Read the sales CSV into a DataFrame and preview its first five rows
df_sales = pd.read_csv(filepath_or_buffer='Sales_with_NaNs_v1.3.csv')
df_sales.head(n=5)

## 3. Data Analysis and Cleaning
### Descriptive Statistics
We use the .describe() function to get a statistical summary of the numerical columns in the dataset.

In [None]:
# Statistical summary of the dataset's numerical columns
summary_stats = df_sales.describe()
summary_stats

### Handling Missing Values
The dataset contains missing values (NaNs). For a simple linear regression model, we need complete data for our variables of interest. We will use 'Sales_Before' as the independent variable and 'Sales_After' as the dependent variable. We will remove any rows where either of these columns has a missing value.

In [None]:
# Keep only the rows where both regression variables are present
regression_columns = ['Sales_Before', 'Sales_After']
df_sales_cleaned = df_sales.dropna(how='any', subset=regression_columns)

### Data Distribution
Seaborn's `distplot` function is deprecated, so we use `histplot` (its recommended replacement) with a KDE overlay to visualize the distribution of 'Sales_After'.


In [None]:
# Histogram of 'Sales_After' with a KDE curve overlaid
sales_after_values = df_sales_cleaned['Sales_After']
sns.histplot(sales_after_values, kde=True)
plt.title('Sales After Distribution Plot')
plt.show()

### Relationship between Sales Before and Sales After

A scatter plot is used to visualize the relationship between 'Sales_Before' and 'Sales_After'. This helps us visually determine if there is a linear relationship between the two variables.

In [None]:
# Scatter plot of 'Sales_After' (y-axis) against 'Sales_Before' (x-axis)
x_before = df_sales_cleaned['Sales_Before']
y_after = df_sales_cleaned['Sales_After']
plt.scatter(x_before, y_after, color='lightcoral')
plt.xlabel('Sales Before')
plt.ylabel('Sales After')
plt.title('Sales After vs Sales Before')
plt.box(False)
plt.show()

## 4. Splitting the Dataset
We will now split the data into independent (X) and dependent (y) variables and then further divide them into training and testing sets. 80% of the data will be used for training the model and 20% for testing it.

In [None]:
# Separate predictor and target, keeping each as a single-column DataFrame
feature_columns = ['Sales_Before']
target_columns = ['Sales_After']
X = df_sales_cleaned[feature_columns]  # independent variable
y = df_sales_cleaned[target_columns]   # dependent variable

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## 5. Train the Regression Model
We will create an instance of the LinearRegression model from scikit-learn and train it using our training data (X_train and y_train).

In [None]:
# Create the linear regression estimator and fit it on the training split
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

## 6. Predict the Results
With the model trained, we can now predict the 'Sales_After' values for both our training and testing sets.

In [None]:
# Predict 'Sales_After' with the fitted model for each split
y_pred_train = regressor.predict(X_train)  # predictions for the training inputs
y_pred_test = regressor.predict(X_test)    # predictions for the test inputs

## 7. Plot the Training and Test Results
### Training Set Results
Here, we visualize the model's performance on the data it was trained on. The scatter plot shows the actual data points, and the line plot shows the regression line our model has learned.

In [None]:
# Training-set fit: actual observations as points, fitted regression line on top
plt.scatter(X_train, y_train, color='lightcoral', label='Actual')
plt.plot(X_train, y_pred_train, color='firebrick', label='Predicted')
plt.title('Sales After vs Sales Before (Training Set)')
plt.xlabel('Sales Before')
plt.ylabel('Sales After')
# Each artist carries its own label, so the legend cannot be mismatched.
# A positional label list depends on the order in which matplotlib
# enumerates artists, which mislabels the scatter points as 'Predicted'.
plt.legend()
plt.box(False)
plt.show()

### Test Set Results
This plot shows how well our model generalizes to new, unseen data. We plot the actual test data points and overlay the same regression line learned from the training data.

In [None]:
# Test-set check: held-out observations as points, with the regression line
# learned from the training data drawn over them
plt.scatter(X_test, y_test, color='lightcoral', label='Actual')
plt.plot(X_train, y_pred_train, color='firebrick', label='Predicted')
plt.title('Sales After vs Sales Before (Test Set)')
plt.xlabel('Sales Before')
plt.ylabel('Sales After')
# Each artist carries its own label, so the legend cannot be mismatched.
# A positional label list depends on the order in which matplotlib
# enumerates artists, which mislabels the scatter points as 'Predicted'.
plt.legend()
plt.box(False)
plt.show()

## 8. Regressor Coefficients and Intercept
Finally, we can extract the slope (coefficient) and the y-intercept from our trained regressor model. These values define the linear equation y = mx + c.

In [None]:
# Show the fitted slope (coef_) and y-intercept (intercept_), one per line
print(
    f'Coefficient: {regressor.coef_}',
    f'Intercept: {regressor.intercept_}',
    sep='\n',
)