In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Check versions
print("numpy", np.__version__)
print("pandas", pd.__version__)
print("sklearn", sklearn.__version__)

# Load the dataset
data = pd.read_csv("out.csv")

# Check for missing columns or mismatches
expected_columns = ['age', 'age_youngest_child', 'debt_equity', 'gender', 'bad_payment', 
                    'gold_card', 'pension_plan', 'household_debt_to_equity_ratio', 'income', 
                    'members_in_household', 'months_current_account', 'months_customer', 
                    'call_center_contacts', 'loan_accounts', 'number_products', 'number_transactions', 
                    'non_worker_percentage', 'white_collar_percentage', 'Mortgage', 'Pension', 'Savings', 'rfm_score']
missing_columns = [col for col in expected_columns if col not in data.columns]

if missing_columns:
    print(f"Missing columns in the dataset: {missing_columns}")
else:
    print("All expected columns are present.")

# Prepare data
X = data.drop('rfm_score', axis=1)  # Drop target variable
y = data['rfm_score']               # Target variable

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Standardize all features

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize models
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Polynomial Regression
degree = 2  # Example degree
poly_features = PolynomialFeatures(degree=degree)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)

# Lasso Regression
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

# Ridge Regression
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

# Decision Tree Regression
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Random Forest Regression
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n{model_name}")
    print("MSE:", mean_squared_error(y_true, y_pred))
    print("R2 Score:", r2_score(y_true, y_pred))

evaluate_model(y_test, y_pred_lr, "Linear Regression")
evaluate_model(y_test, y_pred_poly, "Polynomial Regression")
evaluate_model(y_test, y_pred_lasso, "Lasso Regression")
evaluate_model(y_test, y_pred_ridge, "Ridge Regression")
evaluate_model(y_test, y_pred_dt, "Decision Tree")
evaluate_model(y_test, y_pred_rf, "Random Forest")

# Save the trained models
joblib.dump(rf, 'random_forest_model.pkl')
print("Random Forest model saved.")

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved.")


numpy 2.1.3
pandas 2.2.3
sklearn 1.5.2
All expected columns are present.

Linear Regression
MSE: 6.750770659145714
R2 Score: 0.8701171539394337

Polynomial Regression
MSE: 3.6635187684539394
R2 Score: 0.9295149741758058

Lasso Regression
MSE: 6.833321498972284
R2 Score: 0.8685289000107006

Ridge Regression
MSE: 6.750776791509985
R2 Score: 0.870117035954545

Decision Tree
MSE: 1.7508605732426303
R2 Score: 0.9663139564665992

Random Forest
MSE: 0.9981844109095235
R2 Score: 0.9807952249116066
Random Forest model saved.
Scaler saved.
