<a href="https://colab.research.google.com/github/SriKrishnaKoduri/Datascience-projeect/blob/main/Internship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 📦 Package Installation & Imports  
This section installs necessary packages (`prophet`, `mlxtend`) and imports all required libraries such as `pandas`, `numpy`, `sklearn`, `seaborn`, `plotly`, and more.


In [None]:
# 1.1 Install if needed (Prophet)
!pip install prophet mlxtend --quiet

# 1.2 Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from statsmodels.tsa.seasonal import STL
from prophet import Prophet
from mlxtend.frequent_patterns import apriori, association_rules
from datetime import timedelta
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")


In [None]:
# 2.1 Upload dataset from local
from google.colab import files
import io

# files.upload() returns a dict mapping each uploaded file name to its bytes.
uploaded = files.upload()
if not uploaded:
    raise FileNotFoundError("No file was uploaded; re-run this cell and select the retail CSV.")

# 2.2 Load dataset
# Prefer the expected name, but fall back to whatever was actually uploaded:
# Colab renames a re-uploaded file (e.g. "retail_data (1).csv"), so reading a
# hard-coded path can fail or silently load a stale copy.
csv_name = "retail_data.csv" if "retail_data.csv" in uploaded else next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
df.head()


KeyboardInterrupt: 

## 📁 Data Upload and Initial Exploration  
Here we upload the retail dataset from the local system and load it into a pandas DataFrame for further analysis.


In [None]:
# 3.1 Convert date
df['Date of Purchase'] = pd.to_datetime(df['Date of Purchase'])

# 3.2 KNN Imputation
# Identifier columns are grouping keys, not measurements. If they are stored as
# numbers they must not be imputed, outlier-capped or scaled, otherwise
# distinct customers/invoices/products can collapse into one key.
id_cols = ['Customer ID', 'Invoice No', 'Product ID']
numeric_cols = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(id_cols, errors='ignore')  # ignore ids that are absent or non-numeric
)
imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# 3.2.1 (Optional) Iterative Imputer using MICE
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Uncomment this block if you want to use Iterative Imputer instead of KNN
# mice_imputer = IterativeImputer(random_state=0)
# df[numeric_cols] = mice_imputer.fit_transform(df[numeric_cols])

# 3.3 Outlier Handling (Tukey’s method)
def cap_outliers(col):
    """Clip column `col` of the global `df` to the Tukey fences, in place.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR, computed from the column's
    current values. Nothing is returned; `df[col]` is overwritten.
    """
    first_quartile = df[col].quantile(0.25)
    third_quartile = df[col].quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    df[col] = np.clip(df[col], first_quartile - whisker, third_quartile + whisker)

# Cap each numeric column in turn; cap_outliers rewrites df[...] in place.
for column_name in numeric_cols:
    cap_outliers(column_name)

# 3.3.1 (Optional) Robust Z-Score method for outlier detection
from scipy.stats import median_abs_deviation

def robust_z_score(series):
    """Return the robust z-score of each value: (x - median) / (1.4826 * MAD).

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric values. NaNs are ignored when estimating the median and MAD and
        stay NaN in the result.

    Returns
    -------
    Same container type as `series`, holding the robust z-scores. When the MAD
    is zero (at least half the values equal the median) or cannot be estimated,
    no value can be scored, so zeros are returned instead of inf/NaN produced
    by dividing by zero.
    """
    median = np.nanmedian(series)
    mad = median_abs_deviation(series, nan_policy='omit')
    scale = 1.4826 * mad
    centered = series - median
    if not np.isfinite(scale) or scale == 0:
        # Degenerate spread: report "no deviation" rather than dividing by zero.
        return centered * 0.0
    return centered / scale

# Cap values whose robust z-score exceeds ±3.
# The cap is expressed in the column's own units (median ± 3 * 1.4826 * MAD);
# writing the z-score limit (±3) itself into the column would mix z-units with
# data units.
for col in numeric_cols:
    col_median = df[col].median()
    col_scale = 1.4826 * median_abs_deviation(df[col], nan_policy='omit')
    if not np.isfinite(col_scale) or col_scale == 0:
        continue  # no usable spread estimate, so nothing can be flagged
    df[col] = df[col].clip(lower=col_median - 3 * col_scale,
                           upper=col_median + 3 * col_scale)

# 3.4 Standardization
# Scale into a copy and keep `df` in original units. Later cells sum
# 'Selling Price' per customer and per month and test 'Purchase Quantity' > 0;
# those operations are only meaningful on unscaled values.
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# 3.4.1 (Optional) Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler

# Uncomment to try Min-Max instead of StandardScaler
# scaler = MinMaxScaler()
# df_scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# 3.5 Feature Engineering
# Every per-customer feature is built on one frame indexed by 'Customer ID'.
# Keeping that index throughout lets the joins and column assignments below
# align by customer; a DataFrame.merge on the key name would replace the index
# with a positional one and break index-based alignment.
first_purchase = df.groupby('Customer ID')['Date of Purchase'].min()
last_purchase = df.groupby('Customer ID')['Date of Purchase'].max()

today = df['Date of Purchase'].max()  # reference date = latest purchase in the data
recency = (today - last_purchase).dt.days
frequency = df.groupby('Customer ID')['Invoice No'].nunique()
monetary = df.groupby('Customer ID')['Selling Price'].sum()
rfm = pd.DataFrame({'Recency': recency, 'Frequency': frequency, 'Monetary': monetary})

# One loyalty value per customer. Averaging guarantees a single row per
# customer even if the score differs between that customer's transactions.
# NOTE(review): assumes 'Loyalty Score' is numeric (it is used as a model feature later) - confirm.
rfm['Loyalty Score'] = df.groupby('Customer ID')['Loyalty Score'].mean()

# 3.5.1 Approximate Customer Lifetime Value (CLV)
# Assumptions: Avg Order Value * Purchase Frequency * (Profit Margin)
df['Total'] = df['Selling Price']  # Assuming 'Selling Price' = order value
clv = df.groupby('Customer ID').agg(
    Purchase_Frequency=('Invoice No', 'nunique'),
    Total_Revenue=('Selling Price', 'sum'),
    Avg_Order_Value=('Selling Price', 'mean'),
)

# Assuming profit margin of 10%
clv['CLV'] = clv['Avg_Order_Value'] * clv['Purchase_Frequency'] * 0.10

# Merge back with RFM (both frames are indexed by 'Customer ID')
rfm = rfm.join(clv['CLV'])

# 3.5.2 Average Purchase Frequency
# Frequency / Time span (in months)
purchase_span_months = (last_purchase - first_purchase).dt.days / 30
# A customer whose purchases all fall on one day has a zero span; the rate is
# undefined there, so leave it as NaN instead of dividing by zero (inf).
purchase_span_months = purchase_span_months.replace(0, np.nan)

avg_frequency = frequency / purchase_span_months
rfm['Avg_Purchase_Frequency'] = avg_frequency

# 3.5.3 Discount Utilization Rate
# Needs both 'Discount Amount' and 'Selling Price'; skipped with a message otherwise.

if {'Discount Amount', 'Selling Price'}.issubset(df.columns):
    # Share of the pre-discount price (selling price + discount) that was discounted
    pre_discount_price = df['Selling Price'] + df['Discount Amount']
    df['Discount Utilization'] = df['Discount Amount'] / pre_discount_price
    discount_util = (
        df.groupby('Customer ID')['Discount Utilization']
        .mean()
        .fillna(0)
    )
    rfm = rfm.merge(discount_util.rename('Avg_Discount_Utilization'), on='Customer ID')
else:
    print("Discount Amount column not found. Skipping discount utilization calculation.")

# 3.5.4 Payment Method Preference
# Needs a 'Payment Method' column; skipped with a message otherwise.

def _most_used(values):
    """Return the most frequent entry of `values` (first label of the top count)."""
    return values.value_counts().idxmax()

if 'Payment Method' not in df.columns:
    print("Payment Method column not found. Skipping payment preference calculation.")
else:
    payment_pref = df.groupby('Customer ID')['Payment Method'].agg(_most_used)
    rfm = rfm.merge(payment_pref.rename('Preferred_Payment_Method'), on='Customer ID')


In [None]:
# Loyalty Score: quartile of purchase frequency (1 = lowest, 4 = highest)
loyalty_labels = [1, 2, 3, 4]
try:
    loyalty_bins = pd.qcut(rfm['Frequency'], q=4, labels=loyalty_labels)
except ValueError:
    # Heavily tied frequencies make quartile edges coincide and qcut refuses
    # non-unique bin edges; ranking first breaks the ties deterministically.
    loyalty_bins = pd.qcut(rfm['Frequency'].rank(method='first'), q=4, labels=loyalty_labels)
rfm['Loyalty Score'] = loyalty_bins.astype(int)

# Customer Lifetime Value (CLV = Frequency × Monetary)
# NOTE(review): this replaces any 'CLV' column already present on rfm - confirm which definition is intended.
rfm['CLV'] = rfm['Frequency'] * rfm['Monetary']

# Per-customer series are looked up by customer id, so the assignments below
# work whether rfm carries 'Customer ID' as a column or as its index.
customer_ids = rfm['Customer ID'] if 'Customer ID' in rfm.columns else rfm.index.to_series()

# Discount Utilization Rate (if 'Discount' column exists)
if 'Discount' in df.columns:
    discount_utilization = df.groupby('Customer ID')['Discount'].mean()
    rfm['Discount Utilization Rate'] = customer_ids.map(discount_utilization)

# Payment Method Preference (if 'Payment Method' column exists)
def _mode_or_unknown(values):
    """Return the first mode of `values`, or 'Unknown' when there is none."""
    modes = values.mode()
    return 'Unknown' if modes.empty else modes[0]

if 'Payment Method' in df.columns:
    preferred_payment = df.groupby('Customer ID')['Payment Method'].agg(_mode_or_unknown)
    rfm['Payment Method Preference'] = customer_ids.map(preferred_payment)


## 🧹 Data Preprocessing  
We perform preprocessing such as date conversion, handling missing values using `KNNImputer`, and standardizing numerical features.


In [None]:
# 4.1 Recency Histogram
sns.histplot(rfm['Recency'], bins=30, kde=True)
plt.title("Recency Distribution")
plt.show()

# 4.2 Monthly Sales
monthly_sales = df.set_index('Date of Purchase').resample('M')['Selling Price'].sum()
monthly_sales.plot(title="Monthly Sales Over Time")
plt.show()

# 4.3 STL Decomposition
stl = STL(monthly_sales, seasonal=13)
res = stl.fit()
res.plot()
plt.show()

# 4.4 Correlation Matrix
# Correlate numeric columns only: df also holds a datetime column and may hold
# text columns, on which DataFrame.corr() raises in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, fmt=".2f")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# SARIMA Forecasting
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Hold out the final 12 observations for testing; fit on everything before them
train, test = monthly_sales.iloc[:-12], monthly_sales.iloc[-12:]

# SARIMA(1,1,1)x(1,1,1,12), fitted without optimizer output
sarima_model = SARIMAX(train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
sarima_result = sarima_model.fit(disp=False)

# Predict from the first to the last timestamp of the hold-out window
holdout_start = test.index[0]
holdout_end = test.index[-1]
sarima_forecast = sarima_result.predict(start=holdout_start, end=holdout_end)

# Overlay training data, hold-out actuals and the forecast on one figure
plt.figure(figsize=(10, 5))
for curve_label, curve_x, curve_y in (
    ('Train', train.index, train),
    ('Test', test.index, test),
    ('SARIMA Forecast', test.index, sarima_forecast),
):
    plt.plot(curve_x, curve_y, label=curve_label)
plt.legend()
plt.title("SARIMA Forecast vs Actual")
plt.show()


In [None]:
# Prophet Forecasting with Custom Seasonality
# Fit on all but the final 12 months: the 12 forecast periods requested below
# then fall exactly on the held-out months, so the last 12 rows of `forecast`
# are out-of-sample predictions that can be scored against the actuals.
df_prophet = monthly_sales.iloc[:-12].reset_index()
df_prophet.columns = ['ds', 'y']

# Define holiday example
holidays = pd.DataFrame({
    'holiday': 'festival',
    'ds': pd.to_datetime(['2021-12-25', '2022-01-01', '2022-10-24']),
    'lower_window': 0,
    'upper_window': 1,
})

# Initialize Prophet with holidays
m = Prophet(holidays=holidays)
# NOTE(review): the input has one observation per month, so a 30.5-day
# seasonality may not be identifiable from it - confirm this term is wanted.
m.add_seasonality(name='monthly', period=30.5, fourier_order=5)
m.fit(df_prophet)

# Forecast: history plus the 12 months that follow the training data
future = m.make_future_dataframe(periods=12, freq='M')
forecast = m.predict(future)

# Plot
fig1 = m.plot(forecast)
plt.title("Prophet Forecast with Custom Seasonality and Holidays")
plt.show()


## 🛠️ Feature Engineering  
In this section, we derive new features including RFM metrics (Recency, Frequency, Monetary), Customer Lifetime Value (CLV), and Loyalty Score.


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Monthly sales aggregation
monthly_sales = df.resample('M', on='Date of Purchase')['Selling Price'].sum()

# Train-test split: the last 12 months are held out
train = monthly_sales[:-12]
test = monthly_sales[-12:]

# SARIMA Model (adjust order as needed)
sarima_model = SARIMAX(train, order=(1,1,1), seasonal_order=(1,1,1,12))
sarima_result = sarima_model.fit(disp=False)

# Forecasting: 12 steps ahead, relabelled with the hold-out dates
sarima_forecast = sarima_result.forecast(steps=12)
sarima_forecast.index = test.index

# Evaluation
# RMSE is taken as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
sarima_rmse = np.sqrt(mean_squared_error(test, sarima_forecast))
sarima_mae = mean_absolute_error(test, sarima_forecast)

print("SARIMA RMSE:", sarima_rmse)
print("SARIMA MAE:", sarima_mae)

# Plot
plt.figure(figsize=(10,4))
plt.plot(train.index, train, label='Train')
plt.plot(test.index, test, label='Test')
plt.plot(sarima_forecast.index, sarima_forecast, label='SARIMA Forecast')
plt.legend()
plt.title("SARIMA Forecast vs Actuals")
plt.show()


In [None]:
# LSTM for Time-Series Forecasting
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler

# Rescale the monthly totals to [0, 1]; the scaler expects a 2-D column
monthly_column = monthly_sales.values.reshape(-1, 1)
scaler = MinMaxScaler()
scaled_series = scaler.fit_transform(monthly_column)

# Sequence generator
def create_sequences(data, seq_length):
    """Slice `data` into sliding windows and their next-step targets.

    Window k is data[k : k + seq_length] and its target is data[k + seq_length].
    Returns (windows, targets) as numpy arrays; both are empty when `data` has
    no more than `seq_length` elements.
    """
    window_count = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(window_count)]
    targets = [data[start + seq_length] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# Build 12-step windows from the scaled series
seq_length = 12
X, y = create_sequences(scaled_series, seq_length)

# Chronological split: first 80% of the windows train, the rest test
split = int(len(X) * 0.8)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# LSTM Model: one 64-unit LSTM layer feeding a single linear output
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(seq_length, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=30, verbose=0)

# Forecast, then undo the scaling on predictions and targets alike
lstm_pred = model.predict(X_test)
lstm_pred_inv = scaler.inverse_transform(lstm_pred)
y_test_inv = scaler.inverse_transform(y_test)

# Plot
for curve, curve_label in ((y_test_inv, 'Actual'), (lstm_pred_inv, 'LSTM Forecast')):
    plt.plot(curve, label=curve_label)
plt.title("LSTM Forecast vs Actual")
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def print_metrics(true, pred, model_name):
    """Print MAE, RMSE and MAPE of `pred` against `true` under `model_name`.

    Parameters
    ----------
    true, pred : array-like
        Actual and predicted values. Both are flattened to 1-D floats so that a
        column vector and a flat vector cannot silently broadcast to a matrix.
    model_name : str
        Label printed in front of the metrics.

    MAPE is averaged over the observations whose actual value is non-zero
    (the percentage error is undefined at zero); it is NaN when every actual
    value is zero.
    """
    true = np.asarray(true, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    nonzero = true != 0
    if nonzero.any():
        mape = np.mean(np.abs((true[nonzero] - pred[nonzero]) / true[nonzero])) * 100
    else:
        mape = float('nan')
    print(f"{model_name}:\n  MAE: {mae:.2f}, RMSE: {rmse:.2f}, MAPE: {mape:.2f}%\n")

# Evaluate SARIMA
print_metrics(test.values, sarima_forecast.values, "SARIMA")

# Evaluate Prophet
# Pick Prophet's predictions by date rather than by position: the tail of
# `forecast` holds whatever periods were requested last, which need not be the
# test months. A date missing from `forecast` becomes NaN and fails loudly in
# the metric call instead of being scored against the wrong month.
# NOTE(review): if Prophet was fitted on data that includes the test months,
# this is an in-sample score and not comparable with SARIMA's hold-out score.
prophet_pred = forecast.set_index('ds')['yhat'].reindex(test.index)
print_metrics(test.values, prophet_pred.values, "Prophet")

# Evaluate LSTM
print_metrics(y_test_inv.flatten(), lstm_pred_inv.flatten(), "LSTM")


## 📊 Exploratory Data Analysis (EDA)  
We use visualizations (Seaborn, Plotly) and statistical summaries to uncover trends in purchasing behavior, customer demographics, and seasonal effects.


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Monthly sales as a single-column 2-D array, then min-max scaled
sales = monthly_sales.values.reshape(-1, 1)
scaler = MinMaxScaler()
scaler.fit(sales)
scaled_sales = scaler.transform(sales)

# Create sequences for LSTM
def create_sequences(data, seq_len):
    """Pair every run of `seq_len` consecutive rows of `data` with the row after it.

    Returns (inputs, targets) as numpy arrays, one entry per starting offset
    0 .. len(data) - seq_len - 1; both are empty if no full window fits.
    """
    pairs = [
        (data[offset:offset + seq_len], data[offset + seq_len])
        for offset in range(len(data) - seq_len)
    ]
    inputs = [window for window, _ in pairs]
    targets = [following for _, following in pairs]
    return np.array(inputs), np.array(targets)

seq_length = 12
X, y = create_sequences(scaled_sales, seq_length)

# Split into train and test: the last 12 windows are held out.
# With 12 windows or fewer the training slice X[:-12] would be empty, so stop
# with a clear message instead of failing inside model.fit.
if len(X) <= 12:
    raise ValueError(
        f"Need more than {seq_length + 12} monthly observations for a 12-window "
        f"hold-out; only {len(scaled_sales)} are available."
    )
X_train, y_train = X[:-12], y[:-12]
X_test, y_test = X[-12:], y[-12:]

# Build and train LSTM model
model = Sequential([
    LSTM(64, activation='relu', input_shape=(seq_length, 1)),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=50, verbose=0)

# Forecast and map predictions/targets back to the original sales scale
lstm_preds = model.predict(X_test)
lstm_preds_rescaled = scaler.inverse_transform(lstm_preds)
y_test_rescaled = scaler.inverse_transform(y_test.reshape(-1,1))

# Plot results
plt.figure(figsize=(10,4))
plt.plot(y_test_rescaled, label='Actual')
plt.plot(lstm_preds_rescaled, label='LSTM Forecast')
plt.legend()
plt.title("LSTM Sales Forecast")
plt.show()

# Evaluate
# RMSE is taken as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
from sklearn.metrics import mean_squared_error, mean_absolute_error
print("LSTM RMSE:", np.sqrt(mean_squared_error(y_test_rescaled, lstm_preds_rescaled)))
print("LSTM MAE:", mean_absolute_error(y_test_rescaled, lstm_preds_rescaled))


## 🧩 Customer Segmentation  
Using clustering algorithms like Gaussian Mixture Models and Agglomerative Clustering, we segment customers based on RFM and behavioral features.


In [None]:
# 5.1 Gaussian Mixture Model Clustering
X_seg = rfm[['Recency', 'Frequency', 'Monetary', 'Loyalty Score']]
gmm = GaussianMixture(n_components=4, random_state=0)
rfm['Segment'] = gmm.fit_predict(X_seg)

# 5.2 Visualize Clusters
sns.pairplot(rfm, hue='Segment', vars=['Recency', 'Frequency', 'Monetary'])
plt.suptitle("Customer Segments")
plt.show()

# 5.3 Customer Segment Profiling
# Profile only the columns that exist on rfm. 'CLV' and
# 'Avg_Discount_Utilization' are created conditionally in other cells, and
# naming a missing column in an aggregation spec raises KeyError.
profile_candidates = ['Recency', 'Frequency', 'Monetary', 'Loyalty Score',
                      'CLV', 'Avg_Discount_Utilization']
profile_cols = [name for name in profile_candidates if name in rfm.columns]
segment_profiles = rfm.groupby('Segment')[profile_cols].mean()

print("Segment Profiles (mean values):")
display(segment_profiles)

# Optional: View payment method preference counts
if 'Preferred_Payment_Method' in rfm.columns:
    payment_distribution = rfm.groupby('Segment')['Preferred_Payment_Method'].value_counts(normalize=True).unstack().fillna(0)
    print("Payment Preferences by Segment:")
    display(payment_distribution)


## 🔮 Sales Forecasting  
We forecast daily sales 90 days ahead using a default Prophet model; the variant with custom seasonality and holiday effects is fitted on monthly data in an earlier cell.


In [None]:
# 6.1 Prepare for Prophet
# Prophet expects a frame with columns 'ds' (timestamp) and 'y' (value)
prophet_names = {'Date of Purchase': 'ds', 'Selling Price': 'y'}
sales_df = df[list(prophet_names)].rename(columns=prophet_names)
# Total sales per distinct timestamp, with 'ds' kept as a regular column
daily_sales = sales_df.groupby('ds').sum()
daily_sales = daily_sales.reset_index()

# 6.2 Fit Model
model = Prophet()
model.fit(daily_sales)

# 6.3 Forecast Future
# History plus 90 further periods (Prophet's default frequency)
future = model.make_future_dataframe(periods=90)
forecast = model.predict(future)

# 6.4 Plot Forecast
model.plot(forecast)
plt.title("Sales Forecast (90 days ahead)")
plt.show()


## ⚠️ Customer Churn Prediction  
Using machine learning models like Random Forest, we predict which customers are at risk of churning.


In [None]:
# 7.1 Label Churn (no purchase in 90+ days)
last_purchase = df.groupby('Customer ID')['Date of Purchase'].max()
churn_threshold = df['Date of Purchase'].max() - timedelta(days=90)
# Look each customer's last purchase up by id, so the label is correct whether
# rfm carries 'Customer ID' as a column or as its index (plain assignment of
# the grouped series would depend on the two indexes matching).
customer_ids = rfm['Customer ID'] if 'Customer ID' in rfm.columns else rfm.index.to_series()
rfm['Churn'] = (customer_ids.map(last_purchase) < churn_threshold).astype(int)

# 7.2 Model Training
# 'Recency' is deliberately not a feature: the label above is a direct function
# of days since last purchase, so including it leaks the target and yields a
# trivially perfect classifier.
# NOTE(review): the remaining features are still computed over the whole
# history, including the 90-day label window - consider a cutoff-date design.
X = rfm[['Frequency', 'Monetary', 'Loyalty Score']]
y = rfm['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)  # seeded so reruns reproduce the same forest
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7.3 Evaluation
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Define X and y
# The churn label is stored on the per-customer `rfm` table; the
# transaction-level `df` has no 'Churn' column. Features are listed explicitly
# so only numeric columns reach the models, and 'Recency' is left out because
# the churn label is derived from time since last purchase.
churn_features = ['Frequency', 'Monetary', 'Loyalty Score']
X = rfm[churn_features]
y = rfm['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration cap raised to 1000; fit() returns the
# estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
log_reg_preds = log_reg.predict(X_test)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree, seeded for reproducible splits
tree_settings = {'random_state': 42}
dtree = DecisionTreeClassifier(**tree_settings)
dtree.fit(X_train, y_train)
dtree_preds = dtree.predict(X_test)


## 🛒 Market Basket Analysis & Cross-Selling  
In this section, we apply Association Rule Mining using the Apriori algorithm to uncover product bundling opportunities and cross-selling strategies.


In [None]:
# 8.1 Create Basket Matrix
# Rows are invoices, columns are products; invoice/product pairs that never
# occur get quantity 0.
basket = (
    df.groupby(['Invoice No', 'Product ID'])['Purchase Quantity']
    .sum()
    .unstack(fill_value=0)
    .fillna(0)
)
# One vectorized comparison yields the boolean "bought / not bought" matrix;
# it replaces the per-element applymap call, which is deprecated in recent pandas.
basket = basket > 0

# 8.2 Association Rules
frequent_items = apriori(basket, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_items, metric="lift", min_threshold=1)
rules.sort_values(by='confidence', ascending=False).head()


## 📈 Seasonal Decomposition  
Here we apply STL decomposition to identify trends and seasonality in the time series sales data.


In [None]:
### Strategic Insights:

- **SARIMA and Prophet** show seasonal trends with a peak during festival months. Stock inventory accordingly.
- **LSTM** captures trend but might lag if data is noisy or highly seasonal.
- **Dip detected** in off-season months (e.g., April-June) — consider running promotional campaigns.
- **Recommendation:** Increase stock and marketing efforts during November-December and major holidays.


In [None]:
import xgboost as xgb

# Gradient-boosted classifier; settings gathered in one place for readability
xgb_settings = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model = xgb.XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve

def evaluate_model(name, y_true, y_pred, y_score=None):
    """Print confusion matrix, classification report and AUC-ROC for one model.

    Parameters
    ----------
    name : str
        Label printed in the header line.
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels; used for the confusion matrix and the report.
    y_score : array-like, optional
        Scores or probabilities for the positive class. When given, AUC-ROC is
        computed from them; when omitted, the hard labels in `y_pred` are used,
        which gives the AUC of a single operating point only.

    If `roc_auc_score` raises ValueError (for example when `y_true` contains a
    single class), the AUC line reports "undefined" instead of aborting.
    """
    print(f"\n{name} Evaluation:")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    auc_input = y_pred if y_score is None else y_score
    try:
        print("AUC-ROC:", roc_auc_score(y_true, auc_input))
    except ValueError as err:
        print("AUC-ROC: undefined -", err)

# Evaluate all models
# `rf_best` / `rf_preds` are not assigned anywhere else in the visible
# notebook, so the Random Forest is fitted here on the same split as the other
# classifiers to make the comparison below runnable and like-for-like.
rf_best = RandomForestClassifier(random_state=42)
rf_best.fit(X_train, y_train)
rf_preds = rf_best.predict(X_test)

evaluate_model("Logistic Regression", y_test, log_reg_preds)
evaluate_model("Decision Tree", y_test, dtree_preds)
evaluate_model("Random Forest", y_test, rf_preds)
evaluate_model("XGBoost", y_test, xgb_preds)


In [None]:
def plot_curves(model, X_test, y_test, label):
    """Draw the ROC and precision-recall curves of `model` side by side.

    Scores are the second column of `model.predict_proba(X_test)`. The ROC
    legend entry shows the AUC rounded to two decimals; the figure is displayed
    and nothing is returned.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)
    auc_value = roc_auc_score(y_test, positive_scores)

    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ROC curve with the chance diagonal for reference
    roc_ax.plot(fpr, tpr, label=f'{label} (AUC = {auc_value:.2f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("ROC Curve")
    roc_ax.legend()

    # Right panel: precision against recall
    pr_ax.plot(recall, precision, label=label)
    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title("Precision-Recall Curve")
    pr_ax.legend()

    fig.tight_layout()
    plt.show()

# Plot for each
# One ROC / precision-recall figure per fitted classifier, all on the same test split.
# NOTE(review): this needs `rf_best` (a fitted Random Forest exposing
# predict_proba) to exist in the kernel - confirm an earlier cell defines it,
# otherwise the third call raises NameError after the first two figures render.
plot_curves(log_reg, X_test, y_test, "Logistic Regression")
plot_curves(dtree, X_test, y_test, "Decision Tree")
plot_curves(rf_best, X_test, y_test, "Random Forest")
plot_curves(xgb_model, X_test, y_test, "XGBoost")
