In [42]:
import pandas as pd
import numpy as np

In [43]:
np.random.seed(42)
num_samples = 1000

Creating survey dataset

In [44]:
data = {
    'respondent_id': range(1, num_samples + 1),
    'satisfaction_rating': np.random.randint(1, 6, num_samples),  # Ratings from 1 to 5
    'recommendation': np.random.choice(['Yes', 'No'], num_samples),
    'primary_reason': np.random.choice(['Quality', 'Price', 'Customer Service', 'Features', 'Other'], num_samples)
}

In [45]:
survey_df = pd.DataFrame(data)
survey_df.to_csv('synthetic_survey_data.csv', index=False)

In [46]:
print(survey_df.head(10))

   respondent_id  satisfaction_rating recommendation    primary_reason
0              1                    4             No          Features
1              2                    5             No           Quality
2              3                    3             No  Customer Service
3              4                    5             No             Other
4              5                    5             No  Customer Service
5              6                    2            Yes  Customer Service
6              7                    3             No           Quality
7              8                    3            Yes           Quality
8              9                    3             No           Quality
9             10                    5            Yes             Price


In [7]:
# from google.colab import files
# files.download('synthetic_survey_data.csv')

Data exploration

In [8]:
print(survey_df.isnull().sum())

respondent_id          0
satisfaction_rating    0
recommendation         0
primary_reason         0
dtype: int64


In [9]:
survey_df = survey_df.dropna()

One Hot Encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
encoder=OneHotEncoder(sparse_output=False)
encoded_features=encoder.fit_transform(survey_df[['primary_reason', 'recommendation']])

In [12]:
encoded_df=pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['primary_reason', 'recommendation']))
survey_df=pd.concat([survey_df.drop(['primary_reason', 'recommendation'], axis=1), encoded_df], axis=1)

In [13]:
survey_df.head()

Unnamed: 0,respondent_id,satisfaction_rating,primary_reason_Customer Service,primary_reason_Features,primary_reason_Other,primary_reason_Price,primary_reason_Quality,recommendation_No,recommendation_Yes
0,1,4,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2,5,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,3,3,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,5,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,5,5,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Standardization

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
numerical_features=['satisfaction_rating']
scaler=StandardScaler()
survey_df[numerical_features]=scaler.fit_transform(survey_df[numerical_features])

In [16]:
survey_df.head()

Unnamed: 0,respondent_id,satisfaction_rating,primary_reason_Customer Service,primary_reason_Features,primary_reason_Other,primary_reason_Price,primary_reason_Quality,recommendation_No,recommendation_Yes
0,1,0.6953,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2,1.393393,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,3,-0.002792,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,1.393393,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,5,1.393393,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Train test split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x=survey_df.drop(['respondent_id'], axis=1)
y=survey_df['satisfaction_rating']

In [19]:
x_train, x_val, y_train, y_val=train_test_split(x, y, test_size=0.2, random_state=42)
print(x_train.shape)
print(x_val.shape)

(800, 8)
(200, 8)


Model training

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
rf_model=RandomForestRegressor(random_state=42, n_estimators=1000)
rf_model.fit(x_train, y_train)

In [22]:
y_pred_rf=rf_model.predict(x_val)

Evaluate the Model

In [23]:
mse_rf=mean_squared_error(y_val, y_pred_rf)
r2_rf=r2_score(y_val, y_pred_rf)

In [24]:
print(mse_rf)
print(r2_rf)

4.518663620572654e-29
1.0


Hyperparameter Tuning

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [27]:
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [28]:
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

In [29]:
print(best_params)

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [30]:
y_pred_best_rf = best_rf_model.predict(x_val)
mse_best_rf = mean_squared_error(y_val, y_pred_best_rf)
r2_best_rf = r2_score(y_val, y_pred_best_rf)

In [31]:
print(mse_best_rf)
print(r2_best_rf)

1.2439967054136317e-30
1.0


Cross-Validation

In [32]:
from sklearn.model_selection import cross_val_score

In [33]:
cv_scores = cross_val_score(best_rf_model, x, y, cv=5, scoring='r2')

In [34]:
print("Cross-Validation R^2 Scores:", cv_scores)
print("Mean R^2 Score:", cv_scores.mean())

Cross-Validation R^2 Scores: [1. 1. 1. 1. 1.]
Mean R^2 Score: 1.0


Saving the Model

In [35]:
import joblib
joblib.dump(best_rf_model, 'best_rf_model.pkl')

['best_rf_model.pkl']

In [37]:
# from google.colab import files
# files.download('best_rf_model.pkl')

In [39]:
import json

column_names = list(x_train.columns)
with open('column_names.json', 'w') as f:
    json.dump(column_names, f)