# **LOADING DATASET**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from tabulate import tabulate
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
!pip install tabulate
!pip install pandas statsmodels



In [None]:
# Read the CSV into a DataFrame, then preview the first five rows
# (five is the default row count for head()).
df = pd.read_csv('retail_sales_dataset.csv')
df.head()

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(1000, 9)

In [None]:
# Print each column's non-null count and dtype, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB


# **ANALISIS**

In [None]:
# Predictor columns and the target column
X = df[['Quantity', 'Price per Unit']]
y = df['Total Amount']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training split (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Summary rows as [metric label, formatted value]; the coefficient entry pairs
# each fitted coefficient with its feature column name, in column order
data = [
    [
        "Koefisien",
        ", ".join(
            f"{coef:.8f} ({column})"
            for column, coef in zip(X.columns, model.coef_)
        ),
    ],
    ["Intersep", f"{model.intercept_:.8f}"],
    ["Mean Squared Error (MSE)", f"{mse:.8f}"],
    ["R-squared", f"{r2:.8f}"],
]

# Render the summary as a plain-text table
print(tabulate(data, headers=['Metrik', 'Nilai'], tablefmt='simple'))

Metrik                    Nilai
------------------------  ----------------------------------------------------
Koefisien                 179.77579672 (Quantity), 2.48983555 (Price per Unit)
Intersep                  -439.26614328
Mean Squared Error (MSE)  41860.81317289
R-squared                 0.85699816


In [None]:
# Predictor columns and the target column
X = df[['Quantity', 'Price per Unit']]
y = df['Total Amount']

# statsmodels' OLS does not include an intercept by default, so add a constant column
X = sm.add_constant(X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit OLS on the training split; `model` is the fitted results object
model = sm.OLS(y_train, X_train).fit()

# Per-term estimates and their inference statistics
coefficients = model.params
standard_errors = model.bse
t_values = model.tvalues
p_values = model.pvalues

# One table row per term, with estimates rounded to 3 decimals.
# p-values below 0.001 are reported as "<0.001" instead of being rounded:
# rounding would print them as 0, which reads as an exact zero probability.
data = [
    [
        feature,
        round(coefficients[feature], 3),
        round(standard_errors[feature], 3),
        round(t_values[feature], 3),
        "<0.001" if p_values[feature] < 0.001 else f"{p_values[feature]:.3f}",
    ]
    for feature in coefficients.index
]

# Show the regression results table
headers = ["Fitur", "Koefisien", "Std Error", "t-value", "p-value"]
print(tabulate(data, headers=headers, tablefmt="grid"))

# Evaluate the fitted model on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nEvaluasi Model:")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R-squared (R²): {r2:.3f}")


+----------------+-------------+-------------+-----------+-----------+
| Fitur          |   Koefisien |   Std Error |   t-value |   p-value |
| const          |    -439.266 |      19.687 |   -22.312 |         0 |
+----------------+-------------+-------------+-----------+-----------+
| Quantity       |     179.776 |       6.772 |    26.547 |         0 |
+----------------+-------------+-------------+-----------+-----------+
| Price per Unit |       2.49  |       0.04  |    61.781 |         0 |
+----------------+-------------+-------------+-----------+-----------+

Evaluasi Model:
Mean Squared Error (MSE): 41860.813
R-squared (R²): 0.857


**Interpretasi**

Melalui perhitungan di atas, kedua variabel independen (Quantity dan Price per Unit) menghasilkan p-value yang sangat kecil terhadap variabel dependen (Total Amount), yaitu p < 0.001, sehingga bernilai 0 bila dibulatkan ke tiga desimal. Hal tersebut menunjukkan bahwa pengaruh kedua variabel tersebut terhadap Total Amount signifikan secara statistik.

Perhitungan MSE menghasilkan nilai sebesar 41860.81. Nilai tersebut menunjukkan rata-rata kuadrat selisih antara prediksi model dan nilai aktual pada data uji (setara dengan RMSE sekitar 204.6). Meskipun angka error cukup besar, nilai tersebut masih wajar dalam konteks data penjualan dengan Total Amount yang bernilai besar.

Hasil perhitungan R-squared menunjukkan bahwa model mampu menjelaskan 85.7% variabilitas Total Amount pada data uji. Hal tersebut menunjukkan bahwa model sangat baik dalam memprediksi variabel target.
