In [None]:
# Dataset -> https://www.kaggle.com/datasets/kolawale/focusing-on-mobile-app-or-website

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load Ecommerce.csv (path is relative to the working directory) into a
# DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Ecommerce.csv")
df.head(n=5)

In [None]:
# Print a concise summary of the frame: column names, non-null counts, dtypes.
df.info()

In [None]:
# Descriptive statistics for the columns of the frame.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Joint plot of membership length against yearly spend; points are drawn
# semi-transparent so overlapping observations remain visible.
sns.jointplot(
    data=df,
    x='Length of Membership',
    y='Yearly Amount Spent',
    alpha=0.4,
)

In [None]:
# Pairwise scatter plots across the frame's columns, with semi-transparent points.
sns.pairplot(
    data=df,
    kind='scatter',
    plot_kws=dict(alpha=0.4),
)

In [None]:
# Scatter of membership length vs. yearly spend with a fitted regression line.
sns.lmplot(
    data=df,
    x='Length of Membership',
    y='Yearly Amount Spent',
    scatter_kws=dict(alpha=0.4),
)

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns used by the model.
X = df.loc[:, ['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
# Target column the model is trained to predict.
y = df.loc[:, 'Yearly Amount Spent']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Create an unfitted linear regression estimator with default settings.
lr = LinearRegression()

In [None]:
# Fit the linear model on the training split.
lr.fit(X_train, y_train)

In [None]:
# Show the coefficients estimated by the fitted model.
lr.coef_

In [None]:
# Tabulate the fitted coefficients, one row per predictor column.
cdf = pd.DataFrame({'coef': lr.coef_}, index=X.columns)
print(cdf)
# A larger coefficient means a larger change in the predicted target for a
# one-unit increase in that feature (other features held fixed).

In [None]:
# Goodness of fit on the held-out test split.
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so it is reported as R^2 instead of being
# labelled as a percentage "accuracy".
score = lr.score(X_test, y_test)
print(f"R^2 on test set: {score:.4f}")

In [None]:
# Prediction: run the fitted model on the held-out test features.
prediction = lr.predict(X_test)

In [None]:
# Display the predicted values for the test split.
prediction

In [None]:
# Predicted vs. observed target on the test split; labels are set on the
# Axes object returned by seaborn.
ax = sns.scatterplot(x=prediction, y=y_test)
ax.set_xlabel('Prediction')
ax.set_ylabel('Yearly Amount Spent')

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

In [None]:
# Error metrics comparing the held-out targets with the model's predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
# RMSE is the square root of the MSE.
rmse = math.sqrt(mse)

In [None]:
# Report each error metric on its own line as "<label> <value>".
for label, value in (
    ("Mean Absolute Error ", mae),
    ("Mean Squared Error ", mse),
    ("Root Mean Squared Error ", rmse),
):
    print(label, value)

## Residuals
In regression analysis, a residual is the difference between an observed value and the value predicted by the regression model: $e_i = y_i - \hat{y}_i$. Residuals represent the error in prediction and are crucial for assessing the quality of the model. Geometrically, a residual is the signed vertical distance between a data point and the fitted regression line (or, with several predictors as in this notebook, the fitted hyperplane).

In [None]:
# Residual = observed target minus model prediction, for each test row.
residuals = y_test.sub(prediction)
residuals

In [None]:
# Histogram (24 bins) of the residuals with a kernel density estimate overlaid.
sns.displot(
    data=residuals,
    bins=24,
    kde=True,
)