In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv('../Data/clean_data.csv')

# split the data 

In [3]:
X = df.drop(['Customer Satisfaction Rating'], axis = 1)
y = df['Customer Satisfaction Rating']

# import train test split

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Load all required model

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score

In [6]:
models = {
    'LinearRegression' : LinearRegression(),
    'RandomForestRegressor' : RandomForestRegressor(),
    'GradientBoostingRegressor' : GradientBoostingRegressor(),
    'SVC' : SVC(),
    'DecisionTreeRegressor' : DecisionTreeRegressor(),
    'KNeighborsRegressor' : KNeighborsRegressor()
}

# Train all models 

In [7]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    n = X_test.shape[0]
    p = X_test.shape[1]
    adj_r2 = 1 - ((1 - r2) * (n - 1))/ (n - p - 1)

    MSE = mean_squared_error(y_test, y_pred)
    MAE = mean_absolute_error(y_test, y_pred)

    cv_score = cross_val_score(model, X, y, cv = 5)
    print(f'{name} model r2 score is {r2}, adj_r2 is {adj_r2}, MSE is {MSE}, MAE is {MAE}')
    print(f'{name} model cross validation is score is {cv_score}')
    print('                           ')

LinearRegression model r2 score is 0.31462802687397773, adj_r2 is 0.30584120670569537, MSE is 1.3519167391158071, MAE is 1.0301822861186811
LinearRegression model cross validation is score is [0.34710781 0.28106948 0.30684423 0.31341323 0.27254344]
                           
RandomForestRegressor model r2 score is 0.8893507163020874, adj_r2 is 0.8879321357418577, MSE is 0.2182590252707581, MAE is 0.39987364620938626
RandomForestRegressor model cross validation is score is [0.89147334 0.89006845 0.89346676 0.89096973 0.8890759 ]
                           
GradientBoostingRegressor model r2 score is 0.8933951664536246, adj_r2 is 0.8920284378184147, MSE is 0.21028122624369286, MAE is 0.40998431208961394
GradientBoostingRegressor model cross validation is score is [0.87964889 0.88863947 0.89639578 0.89244723 0.88587535]
                           
SVC model r2 score is -0.6444274125744751, adj_r2 is -0.6655098152997889, MSE is 3.243682310469314, MAE is 1.431407942238267
SVC model cross v

# Creating pipline for model saving

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from imblearn.pipeline import Pipeline as ImbPipeline 


df = pd.read_csv('../Data/pipline.csv')


target_col = 'Customer Satisfaction Rating' 
num_col = ['Customer Age', 'Days_Since_Purchase']
obj_col = ['Customer Gender', 'Product Purchased', 'Ticket Subject', 'Satisfaction_Level']

X = df[num_col + obj_col]
y = df[target_col]  

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_col),
    ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), obj_col)
])

pipeline = ImbPipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", GradientBoostingRegressor())
])

pipeline.fit(X_train, y_train)

joblib.dump(pipeline, "model.pkl") 
print("Full pipeline saved successfully!")
