In [None]:
import pandas as pd

# Load your dataset
df = pd.read_csv('online_retail.csv')

# Check the first few rows
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [None]:
# seeing the null values in each column
print(df.isnull().sum())

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64


In [None]:
df.dropna(subset=['CustomerID'], inplace=True)

In [None]:
# Remove any duplicates
df.drop_duplicates(inplace=True)

In [None]:
# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'].info()

<class 'pandas.core.series.Series'>
Index: 401604 entries, 0 to 541908
Series name: InvoiceDate
Non-Null Count   Dtype         
--------------   -----         
401604 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 6.1 MB


In [None]:
df.to_csv('my_data.csv', index=False)

In [None]:
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [None]:
df = df[df['Quantity']>0]

In [None]:
df_customer = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (df['InvoiceDate'].max() - x.max()).days,  # Recency
    'InvoiceNo': 'nunique',                                             # Frequency
    'TotalPrice': 'sum',                                                # Monetary
    'Quantity': 'sum'                                                   # Total Quantity Purchased
}).reset_index()

In [None]:
df_customer.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary', 'TotalQuantity']
df_customer['AvgOrderValue'] = df_customer['Monetary'] / df_customer['Frequency']

In [None]:
df_customer['CustomerLifespan'] = df_customer['Frequency'] * (365 / df_customer['Recency'])


In [None]:
print(df_customer['CustomerLifespan'])

0          1.123077
1       2555.000000
2         19.729730
3         20.277778
4          1.181230
           ...     
4334       1.317690
4335       2.027778
4336     104.285714
4337    1946.666667
4338      26.071429
Name: CustomerLifespan, Length: 4339, dtype: float64


In [None]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Normalize Monetary and Frequency using z-score normalization
df_customer['MonetaryZ'] = scaler.fit_transform(df_customer[['Monetary']])
df_customer['FrequencyZ'] = scaler.fit_transform(df_customer[['Frequency']])

# Recalculate CLV using normalized metrics
df_customer['CLV'] = df_customer['MonetaryZ'] * df_customer['FrequencyZ'] * df_customer['CustomerLifespan']
df_customer.head()


Unnamed: 0,CustomerID,Recency,Frequency,Monetary,TotalQuantity,AvgOrderValue,CustomerLifespan,MonetaryZ,FrequencyZ,CLV
0,12346.0,325,1,77183.6,74215,77183.6,1.123077,8.363977,-0.424675,-3.989136
1,12347.0,1,7,4310.0,2458,615.714286,2555.0,0.251779,0.35408,227.778108
2,12348.0,74,4,1797.24,2341,449.31,19.72973,-0.027938,-0.035297,0.019456
3,12349.0,18,1,1757.55,631,1757.55,20.277778,-0.032357,-0.424675,0.278637
4,12350.0,309,1,334.4,197,334.4,1.18123,-0.19078,-0.424675,0.095703


In [None]:
import numpy as np

# Apply log transformation to Monetary and Frequency
df_customer['MonetaryLog'] = np.log1p(df_customer['Monetary'])  # log1p is log(1 + x)
df_customer['FrequencyLog'] = np.log1p(df_customer['Frequency'])

# Recalculate CLV using log-transformed metrics
df_customer['CLV'] = df_customer['MonetaryLog'] * df_customer['FrequencyLog'] * df_customer['CustomerLifespan']
df_customer.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,TotalQuantity,AvgOrderValue,CustomerLifespan,MonetaryZ,FrequencyZ,CLV,MonetaryLog,FrequencyLog
0,12346.0,325,1,77183.6,74215,77183.6,1.123077,8.363977,-0.424675,8.760727,11.253955,0.693147
1,12347.0,1,7,4310.0,2458,615.714286,2555.0,0.251779,0.35408,44463.874656,8.368925,2.079442
2,12348.0,74,4,1797.24,2341,449.31,19.72973,-0.027938,-0.035297,237.980689,7.494564,1.609438
3,12349.0,18,1,1757.55,631,1757.55,20.277778,-0.032357,-0.424675,105.026022,7.472245,0.693147
4,12350.0,309,1,334.4,197,334.4,1.18123,-0.19078,-0.424675,4.76139,5.815324,0.693147


In [None]:
from sklearn.model_selection import train_test_split

# Define the feature matrix (X) and target variable (y)
X = df_customer[['Recency', 'FrequencyLog', 'MonetaryLog', 'TotalQuantity', 'AvgOrderValue', 'CustomerLifespan']]
y = df_customer['CLV']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
import numpy as np

# Check for missing or infinite values in y_train
print(y_train.isnull().sum())  # Check for NaN values
print(np.isinf(y_train).sum())  # Check for infinite values


0
64


In [None]:
# Remove rows with NaN or infinite values
X_train = X_train[~y_train.isin([np.nan, np.inf, -np.inf])]
y_train = y_train[~y_train.isin([np.nan, np.inf, -np.inf])]

# Or, alternatively, you can fill NaN values (if applicable)
y_train.fillna(y_train.median(), inplace=True)
print(X_train.dtypes)
print(y_train.dtypes)



Recency               int64
FrequencyLog        float64
MonetaryLog         float64
TotalQuantity         int64
AvgOrderValue       float64
CustomerLifespan    float64
dtype: object
float64


In [None]:
import numpy as np

# Check for missing or infinite values in X_test
print(X_test.isnull().sum())  # Check for NaN values in each column
print(np.isinf(X_test).sum())  # Check for infinite values in each column


Recency             0
FrequencyLog        0
MonetaryLog         0
TotalQuantity       0
AvgOrderValue       0
CustomerLifespan    0
dtype: int64
Recency              0
FrequencyLog         0
MonetaryLog          0
TotalQuantity        0
AvgOrderValue        0
CustomerLifespan    29
dtype: int64


In [None]:
print(X.isnull().sum())  # Check for missing values (NaN)
print(np.isinf(X).sum())  # Check for infinity
print((X > 1e12).sum())  # Check for unusually large values


Recency             0
FrequencyLog        0
MonetaryLog         0
TotalQuantity       0
AvgOrderValue       0
CustomerLifespan    0
dtype: int64
Recency              0
FrequencyLog         0
MonetaryLog          0
TotalQuantity        0
AvgOrderValue        0
CustomerLifespan    93
dtype: int64
Recency              0
FrequencyLog         0
MonetaryLog          0
TotalQuantity        0
AvgOrderValue        0
CustomerLifespan    93
dtype: int64


In [None]:
X['CustomerLifespan'].fillna(X['CustomerLifespan'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['CustomerLifespan'].fillna(X['CustomerLifespan'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['CustomerLifespan'].fillna(X['CustomerLifespan'].median(), inplace=True)


In [None]:
print(X.isnull().sum())


Recency             0
FrequencyLog        0
MonetaryLog         0
TotalQuantity       0
AvgOrderValue       0
CustomerLifespan    0
dtype: int64


In [None]:
print(np.isinf(X_test).sum())  # Check for infinity values


Recency              0
FrequencyLog         0
MonetaryLog          0
TotalQuantity        0
AvgOrderValue        0
CustomerLifespan    29
dtype: int64


In [None]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.median(), inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)


In [None]:
X['CustomerLifespan'] = X['CustomerLifespan'].fillna(X['CustomerLifespan'].median())
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.median(), inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['CustomerLifespan'] = X['CustomerLifespan'].fillna(X['CustomerLifespan'].median())
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)


In [None]:
print("Missing values:\n", X.isnull().sum())
print("Infinity values:\n", np.isinf(X).sum())
print("Max value in X:\n", X.max())


Missing values:
 Recency             0
FrequencyLog        0
MonetaryLog         0
TotalQuantity       0
AvgOrderValue       0
CustomerLifespan    0
dtype: int64
Infinity values:
 Recency             0
FrequencyLog        0
MonetaryLog         0
TotalQuantity       0
AvgOrderValue       0
CustomerLifespan    0
dtype: int64
Max value in X:
 Recency                373.000000
FrequencyLog             5.351858
MonetaryLog             12.543284
TotalQuantity       197491.000000
AvgOrderValue        84236.250000
CustomerLifespan     45260.000000
dtype: float64


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define feature matrix (X) and target variable (y)
X = df_customer[['Recency', 'FrequencyLog', 'MonetaryLog', 'TotalQuantity', 'AvgOrderValue', 'CustomerLifespan']]
y = df_customer['CLV']

# Handle missing values in X
X['CustomerLifespan'] = X['CustomerLifespan'].fillna(X['CustomerLifespan'].median())  # Impute missing values
X.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities
X.fillna(X.median(), inplace=True)  # Impute remaining NaN
X = X.clip(lower=X.quantile(0.01), upper=X.quantile(0.99), axis=1)  # Clip extreme values

# Handle issues in y
y.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities
y.fillna(y.median(), inplace=True)  # Impute missing values
y = y.clip(lower=y.quantile(0.01), upper=y.quantile(0.99))  # Clip extreme values

# Validate dataset cleanliness
print("Missing values in X:\n", X.isnull().sum())
print("Missing values in y:", y.isnull().sum())
print("Infinity values in y:", np.isinf(y).sum())
print("Max value in y:", y.max())
print("Min value in y:", y.min())

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

# Evaluation
print("Train RMSE:", mean_squared_error(y_train, y_pred_train_lr, squared=False))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test_lr, squared=False))
print("R² Score (Test):", r2_score(y_test, y_pred_test_lr))


Missing values in X:
 Recency             0
FrequencyLog        0
MonetaryLog         0
TotalQuantity       0
AvgOrderValue       0
CustomerLifespan    0
dtype: int64
Missing values in y: 0
Infinity values in y: 0
Max value in y: 110028.06386620701
Min value in y: 3.5051561219222176
Train RMSE: 2911.7167794092666
Test RMSE: 3191.2580498476686
R² Score (Test): 0.9429646095720905


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['CustomerLifespan'] = X['CustomerLifespan'].fillna(X['CustomerLifespan'].median())  # Impute missing values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)  # Impute remaining NaN
  X = X.clip(lower=X.quantile(0.01), upper=X.quantile(0.99), axis=1)  # Clip 

In [None]:
# Train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

# Evaluation
print("Train RMSE:", mean_squared_error(y_train, y_pred_train_lr, squared=False))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test_lr, squared=False))
print("R² Score (Test):", r2_score(y_test, y_pred_test_lr))

Train RMSE: 2911.7167794092666
Test RMSE: 3191.2580498476686
R² Score (Test): 0.9429646095720905




In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train the Random Forest model
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Predictions
y_pred_train_rf = rf.predict(X_train)
y_pred_test_rf = rf.predict(X_test)

# Evaluation
print("Random Forest:")
print("Train RMSE:", mean_squared_error(y_train, y_pred_train_rf, squared=False))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test_rf, squared=False))
print("R² Score (Test):", r2_score(y_test, y_pred_test_rf))


Random Forest:
Train RMSE: 383.8407967065872
Test RMSE: 1462.8432234959066
R² Score (Test): 0.9880156042860384




In [None]:
print(df_customer.columns)


Index(['CustomerID', 'Recency', 'Frequency', 'Monetary', 'TotalQuantity',
       'AvgOrderValue', 'CustomerLifespan', 'MonetaryZ', 'FrequencyZ', 'CLV',
       'MonetaryLog', 'FrequencyLog'],
      dtype='object')


In [None]:
# Assuming the main dataset is named df_main
df_customer = df_customer.merge(df[['CustomerID', 'Country']], on='CustomerID', how='left')

# Verify the `Country` column is added
print(df_customer.head())


   CustomerID  Recency  Frequency  Monetary  TotalQuantity  AvgOrderValue  \
0     12346.0      325          1   77183.6          74215   77183.600000   
1     12347.0        1          7    4310.0           2458     615.714286   
2     12347.0        1          7    4310.0           2458     615.714286   
3     12347.0        1          7    4310.0           2458     615.714286   
4     12347.0        1          7    4310.0           2458     615.714286   

   CustomerLifespan  MonetaryZ  FrequencyZ           CLV  MonetaryLog  \
0          1.123077   8.363977   -0.424675      8.760727    11.253955   
1       2555.000000   0.251779    0.354080  44463.874656     8.368925   
2       2555.000000   0.251779    0.354080  44463.874656     8.368925   
3       2555.000000   0.251779    0.354080  44463.874656     8.368925   
4       2555.000000   0.251779    0.354080  44463.874656     8.368925   

   FrequencyLog         Country  
0      0.693147  United Kingdom  
1      2.079442         Icelan

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

#H₀: The distribution of CLV_Category is independent of the Country.
#H₁: The distribution of CLV_Category depends on the Country.

# Step 1: Bin CLV into categories (Low, Medium, High)
df_customer['CLV_Category'] = pd.cut(
    df_customer['CLV'],
    bins=[-float("inf"), df_customer['CLV'].quantile(0.33), df_customer['CLV'].quantile(0.66), float("inf")],
    labels=['Low', 'Medium', 'High']
)

# Step 2: Create a contingency table for CLV_Category and Country
contingency_table = pd.crosstab(df_customer['CLV_Category'], df_customer['Country'])

print("Contingency Table:")
print(contingency_table)

# Step 3: Perform Chi-Square Test
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

# Results
print("\nChi-Square Statistic:", chi2_stat)
print("Degrees of Freedom:", dof)
print("P-value:", p_value)
print("\nExpected Frequencies Table:")
print(expected)

# Step 4: Decision based on p-value
if p_value < 0.05:
    print("\nThere is a significant association between CLV categories and Country.")
else:
    print("\nThere is no significant association between CLV categories and Country.")


Contingency Table:
Country       Australia  Austria  Bahrain  Belgium  Brazil  Canada  \
CLV_Category                                                         
Low                 187      350       17      665      32     151   
Medium              281       29        0      746       0       0   
High                716       19        0      620       0       0   

Country       Channel Islands  Cyprus  Czech Republic  Denmark  ...  RSA  \
CLV_Category                                                    ...        
Low                       342     156              25      293  ...   58   
Medium                    364     447               0       87  ...    0   
High                       41       0               0        0  ...    0   

Country       Saudi Arabia  Singapore  Spain  Sweden  Switzerland  USA  \
CLV_Category                                                             
Low                      9          0    694      91         1334  134   
Medium                   0 

In [None]:
#t test
import pandas as pd
from scipy.stats import ttest_ind

# Select two countries to compare (e.g., USA and Australia)
country1 = "USA"
country2 = "Australia"

# Filter data for the selected countries
data_country1 = df_customer[df_customer['Country'] == country1]['CLV']
data_country2 = df_customer[df_customer['Country'] == country2]['CLV']

# Perform Two-Sample T-Test
t_stat, p_value = ttest_ind(data_country1, data_country2, equal_var=False)  # Use equal_var=False if variances differ

# Results
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_value}")

# Decision based on P-Value
if p_value < 0.05:
    print(f"\nThere is a significant difference in average CLV between {country1} and {country2}.")
else:
    print(f"\nThere is no significant difference in average CLV between {country1} and {country2}.")


T-Statistic: -50.26879726739643
P-Value: 1.04915711094077e-310

There is a significant difference in average CLV between USA and Australia.


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model
gbr.fit(X_train, y_train)

# Predictions
y_pred_train_gbr = gbr.predict(X_train)
y_pred_test_gbr = gbr.predict(X_test)

# Evaluation
print("Gradient Boosting Decision Tree:")
print("Train RMSE:", mean_squared_error(y_train, y_pred_train_gbr, squared=False))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test_gbr, squared=False))
print("R² Score (Test):", r2_score(y_test, y_pred_test_gbr))


Gradient Boosting Decision Tree:
Train RMSE: 289.2857692977802
Test RMSE: 1346.976361620969
R² Score (Test): 0.9898389047909429




In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# Initialize the base estimator for AdaBoost
base_estimator = DecisionTreeRegressor(max_depth=3, random_state=42)

# Initialize AdaBoost Regressor
abr = AdaBoostRegressor(
    estimator=base_estimator,  # Use 'estimator' instead of 'base_estimator'
    n_estimators=50,          # Number of boosting rounds
    learning_rate=0.1,        # Weight applied to each regressor
    random_state=42
)

# Train the AdaBoost Regressor
abr.fit(X_train, y_train)

# Predictions
y_pred_train_abr = abr.predict(X_train)
y_pred_test_abr = abr.predict(X_test)

# Evaluation
print("AdaBoost Regression:")
print("Train RMSE:", mean_squared_error(y_train, y_pred_train_abr, squared=False))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test_abr, squared=False))
print("R² Score (Test):", r2_score(y_test, y_pred_test_abr))


AdaBoost Regression:
Train RMSE: 1571.2997219237554
Test RMSE: 2557.8895737222188
R² Score (Test): 0.9633575793923197




In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd

# Assuming X and y are preprocessed and ready

# Check if y is continuous using pandas
if isinstance(y, pd.Series) and pd.api.types.is_numeric_dtype(y):
    print("Target variable is continuous. Converting to categorical labels...")

    # Define bins and labels (example for numeric ranges)
    bins = [0, 100, 500, 1000, np.inf]  # Adjust bins based on your data range
    labels = ['Low', 'Medium', 'High', 'Very High']  # Customize labels as needed

    # Convert continuous y to categorical
    y = pd.cut(y, bins=bins, labels=labels)

# Encode categorical labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize the Naive Bayes model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Target variable is continuous. Converting to categorical labels...
Accuracy: 0.7649769585253456

Classification Report:
              precision    recall  f1-score   support

        High       0.28      0.69      0.40        64
         Low       0.92      0.91      0.91       402
      Medium       0.69      0.59      0.64       189
   Very High       0.95      0.68      0.79       213

    accuracy                           0.76       868
   macro avg       0.71      0.72      0.68       868
weighted avg       0.83      0.76      0.78       868


Confusion Matrix:
[[ 44   1  15   4]
 [  5 364  33   0]
 [ 43  31 112   3]
 [ 66   0   3 144]]


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd

# Assuming X and y are preprocessed and ready

# Check if y is continuous using pandas
if isinstance(y, pd.Series) and pd.api.types.is_numeric_dtype(y):
    print("Target variable is continuous. Converting to categorical labels...")

    # Define bins and labels (example for numeric ranges)
    bins = [0, 100, 500, 1000, np.inf]  # Adjust bins based on your data range
    labels = ['Low', 'Medium', 'High', 'Very High']  # Customize labels as needed

    # Convert continuous y to categorical
    y = pd.cut(y, bins=bins, labels=labels)

# Encode categorical labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Feature scaling (important for SVM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9308755760368663

Classification Report:
              precision    recall  f1-score   support

        High       0.85      0.61      0.71        64
         Low       0.96      0.99      0.97       402
      Medium       0.83      0.86      0.85       189
   Very High       0.98      0.99      0.98       213

    accuracy                           0.93       868
   macro avg       0.90      0.86      0.88       868
weighted avg       0.93      0.93      0.93       868


Confusion Matrix:
[[ 39   0  25   0]
 [  0 396   6   0]
 [  6  15 163   5]
 [  1   0   2 210]]


In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a pipeline to scale data and train the SVR model
svr = make_pipeline(
    StandardScaler(),  # Scale features for better performance of SVM
    SVR(kernel='rbf', C=1.0, epsilon=0.1)  # SVM with RBF kernel
)

# Train the SVR model
svr.fit(X_train, y_train)

# Predictions
y_pred_train_svr = svr.predict(X_train)
y_pred_test_svr = svr.predict(X_test)

# Evaluation
train_rmse_svr = mean_squared_error(y_train, y_pred_train_svr, squared=False)
test_rmse_svr = mean_squared_error(y_test, y_pred_test_svr, squared=False)

train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("SVM Regression:")
print("Train RMSE:", train_rmse_svr)
print("Test RMSE:", test_rmse_svr)
print("Train MSE:", train_mse_svr)
print("Test MSE:", test_mse_svr)
print("R² Score (Test):", r2_score(y_test, y_pred_test_svr))


SVM Regression:
Train RMSE: 0.6740102410596863
Test RMSE: 0.6861020458499968
Train MSE: 0.45428980505333644
Test MSE: 0.47073601731955117
R² Score (Test): 0.4589737405204487


