In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle
# Raw sales records, one list per column.
# NOTE(review): the columns do not all have the same number of entries
# (OrderDate 43, Region 45, Manager 45, SalesMan 46, Item 46, Units 43,
# Unit_price 45, Sale_amt 28), and a few names ('Marth', 'Alex') look
# truncated. Re-check this literal against the original data source.
raw_data = {
    'OrderDate': ['1-6-18', '1-23-18', '2-9-18', '2-26-18', '3-15-18', '4-1-18', '4-18-18', '5-5-18', '5-22-18', '6-8-18', '6-25-18', '7-12-18', '7-29-18', '8-15-18', '9-1-18', '9-18-18', '10-5-18', '10-22-18', '11-8-18', '11-25-18', '12-12-18', '12-29-18', '1-15-19', '2-1-19', '2-18-19', '3-7-19', '3-24-19', '4-10-19', '4-27-19', '5-14-19', '5-31-19', '6-17-19', '7-4-19', '7-21-19', '8-7-19', '8-24-19', '9-10-19', '9-27-19', '10-14-19', '10-31-19', '11-17-19', '12-4-19', '12-21-19'],
    'Region': ['East', 'Central', 'Central', 'Central', 'West', 'East', 'Central', 'Central', 'West', 'East', 'Central', 'East', 'East', 'East', 'Central', 'East', 'Central', 'East', 'East', 'Central', 'Central', 'East', 'Central', 'East', 'West', 'Central', 'Central', 'East', 'Central', 'East', 'Central', 'Central', 'East', 'Central', 'Central', 'West', 'Central', 'Central', 'West', 'West', 'Central', 'Central', 'Central', 'Central', 'Central'],
    'Manager': ['Martha', 'Hermann', 'Hermann', 'Timothy', 'Timothy', 'Martha', 'Martha', 'Hermann', 'Douglas', 'Martha', 'Hermann', 'Martha', 'Douglas', 'Martha', 'Douglas', 'Martha', 'Hermann', 'Martha', 'Douglas', 'Hermann', 'Douglas', 'Martha', 'Martha', 'Douglas', 'Martha', 'Timothy', 'Hermann', 'Martha', 'Martha', 'Timothy', 'Hermann', 'Martha', 'Douglas', 'Martha', 'Hermann', 'Martha', 'Douglas', 'Timothy', 'Timothy', 'Douglas', 'Martha', 'Hermann', 'Hermann', 'Martha', 'Marth'],
    'SalesMan': ['Alexander', 'Shelli', 'Luis', 'David', 'Stephen', 'Alexander', 'Steven', 'Luis', 'Michael', 'Alexander', 'Sigal', 'Diana', 'Karen', 'Alexander', 'John', 'Alexander', 'Sigal', 'Alexander', 'Karen', 'Shelli', 'John', 'Karen', 'Alexander', 'David', 'Alexander', 'Stephen', 'Luis', 'Steven', 'Diana', 'David', 'Sigal', 'Shelli', 'Alex', 'Karen', 'Alexander', 'Sigal', 'Shelli', 'Stephen', 'David', 'Stephen', 'Michael', 'Steven', 'Luis', 'Luis', 'Steven', 'Steven'],
    'Item': ['Television', 'Home Theater', 'Television', 'Cell Phone', 'Television', 'Home Theater', 'Television', 'Television', 'Television', 'Home Theater', 'Television', 'Home Theater', 'Home Theater', 'Television', 'Desk', 'Video Games', 'Home Theater', 'Cell Phone', 'Video Games', 'Television', 'Television', 'Video Games', 'Home Theater', 'Home Theater', 'Home Theater', 'Video Games', 'Television', 'Cell Phone', 'Television', 'Home Theater', 'Video Games', 'Video Games', 'Desk', 'Video Games', 'Video Games', 'Desk', 'Television', 'Cell Phone', 'Home Theater', 'Television', 'Home Theater', 'Home Theater', 'Home Theater', 'Home Theater', 'Home Theater', 'Home Theater'],
    'Units': [95, 50, 36, 27, 56, 60, 75, 90, 32, 60, 90, 29, 81, 35, 2, 16, 28, 64, 15, 96, 67, 74, 46, 87, 4, 7, 50, 66, 96, 53, 80, 5, 62, 55, 42, 3, 7, 76, 57, 14, 11, 94, 28],
    'Unit_price': [1198.00, 500, 1198.00, 225, 1198.00, 500, 1198.00, 1198.00, 1198.00, 500, 1198.00, 500, 500, 1198.00, 125, 58.5, 500, 225, 225, 1198.00, 1198.00, 58.5, 500, 500, 500, 58.5, 1198.00, 225, 1198.00, 58.5, 58.5, 125, 58.5, 58.5, 58.5, 125, 1198.00, 225, 500, 1198.00, 500, 500, 500, 500, 500],
    'Sale_amt': ['113810.00', '25000.00', '43128.00', '6075.00', '67088.00', '30000.00', '89850.00', '107820.00', '38336.00', '30000.00', '107820.00', '14500.00', '40500.00', '41930.00', '250', '936.00', '14000.00', '14400.00', '3375.00', '5616.00', '80266.00', '4329.00', '23000.00', '43500.00', '2000.00', '3500.00', '2925.00','567.00']
}

# Preprocess the data
# pandas refuses to build a DataFrame from lists of different lengths, so
# only the leading rows that every column can supply are kept, and the
# mismatch is reported instead of being hidden.
column_lengths = {name: len(values) for name, values in raw_data.items()}
row_count = min(column_lengths.values())
if any(length != row_count for length in column_lengths.values()):
    print(f'Warning: column lengths differ ({column_lengths}); '
          f'keeping only the first {row_count} rows.')

# The cleaning steps below rely on DataFrame/Series methods (.str, .isnull)
# that a plain dict of lists does not provide.
data = pd.DataFrame(
    {name: values[:row_count] for name, values in raw_data.items()}
)

# Convert 'OrderDate' to datetime
data['OrderDate'] = pd.to_datetime(data['OrderDate'], format='%m-%d-%y')

# Convert 'Sale_amt' to float (any thousands separators are stripped first;
# regex=False treats ',' as a literal character)
data['Sale_amt'] = data['Sale_amt'].str.replace(',', '', regex=False).astype(float)

# Check for missing values
print(data.isnull().sum())

# No column contains missing values once the rows are aligned.

# Now, let's check linear regression assumptions

# Step 1: Linearity (check by visualizing scatter plots)
import seaborn as sns
import matplotlib.pyplot as plt

# pairplot is a figure-level function that creates its own figure, so the
# size is set through height/aspect instead of a preceding plt.figure()
# call, which would only leave an empty extra figure behind.
# Two panels of 6 x 8 inches give roughly a 12 x 8 inch canvas.
sns.pairplot(
    data=data,
    x_vars=['Units', 'Unit_price'],
    y_vars=['Sale_amt'],
    height=8,
    aspect=0.75,
)
plt.show()

# Step 2: Independence of errors (assumed)

# Step 3: Homoscedasticity (check using scatter plot of residuals)
# Residuals vs Fitted plot
import statsmodels.api as sm  # provides add_constant, OLS and qqplot used below
from statsmodels.stats.outliers_influence import OLSInfluence

# Fit an ordinary least squares model; add_constant supplies the intercept
# column, which sm.OLS does not add on its own
X = sm.add_constant(data[['Units', 'Unit_price']])
model = sm.OLS(data['Sale_amt'], X).fit()

# Get the residuals
residuals = model.resid

# Get the fitted values
fitted_values = model.fittedvalues

# Plot residuals vs fitted values; the dashed line marks zero residual
plt.figure(figsize=(10, 6))
sns.scatterplot(x=fitted_values, y=residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')
plt.show()

# Step 4: Normality of residuals (check using Q-Q plot)
# The residuals are not standardized, so the reference line is scaled to
# the sample's mean and standard deviation (line='s'); a fixed 45-degree
# line is only meaningful for standardized data.
sm.qqplot(residuals, line='s')
plt.title('Q-Q Plot of Residuals')
plt.show()

# Now, let's train the linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pickle

# Features are the unit count and unit price; the target is the sale amount
X = data[['Units', 'Unit_price']]
y = data['Sale_amt']

# Hold out 20% of the rows for evaluation (fixed seed keeps the split stable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Persist the fitted estimator to a pickle file
with open('linear_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)