# In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ---------------------------------------------------------------------------
# Load -> clean -> explore -> encode
#
# The exploratory plots are drawn BEFORE the categorical columns are turned
# into integer codes, so the gender chart shows the original category labels
# on its axis instead of opaque numbers. The frame that reaches the modelling
# code afterwards is the same as if encoding had been done first.
# ---------------------------------------------------------------------------

# Read the ticket export and normalise the header: surrounding whitespace is
# stripped and inner spaces become underscores (e.g. "Customer_Age").
data = pd.read_csv(r"C:\Users\racha\Downloads\customer_support_tickets (1).csv")
data.columns = [c.strip().replace(' ', '_') for c in data.columns]

# The satisfaction rating is the prediction target: rows without one are
# dropped and the remaining values are cast to int so they act as class labels.
data = data.dropna(subset=['Customer_Satisfaction_Rating'])
data['Customer_Satisfaction_Rating'] = data['Customer_Satisfaction_Rating'].astype(int)

# Impute missing values. Text columns get an explicit 'Unknown' category;
# numeric columns get their column mean. Text columns contain no NaN after the
# first step, so doing the numeric fill here (before encoding) gives the same
# result as doing it after encoding.
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = data[col].fillna('Unknown')
data = data.fillna(data.mean(numeric_only=True))

# --- Exploratory plots (on imputed, not-yet-encoded data) -------------------
plt.figure(figsize=(10,6))
sns.histplot(data['Customer_Satisfaction_Rating'], bins=5, kde=True)
plt.title('Customer Satisfaction Rating Distribution')
plt.show()

plt.figure(figsize=(8,6))
# Sorted order keeps the bars in the same sequence the label codes would have
# produced (LabelEncoder assigns codes to the sorted unique values).
sns.countplot(
    x='Customer_Gender',
    data=data,
    order=sorted(data['Customer_Gender'].unique()),
)
plt.title('Gender Distribution')
plt.show()

plt.figure(figsize=(10,6))
sns.histplot(data['Customer_Age'], bins=20, kde=True)
plt.title('Customer Age Distribution')
plt.show()

# --- Encode categorical columns for the models ------------------------------
# Every text column is replaced by integer codes. The fitted encoders are kept
# per column so a code can later be mapped back to its label with
# label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# ---------------------------------------------------------------------------
# Multiclass model: predict the exact satisfaction rating
# ---------------------------------------------------------------------------
# Features are every column except the target; the target is the integer rating.
X = data.drop(['Customer_Satisfaction_Rating'], axis=1)
y_multi = data['Customer_Satisfaction_Rating']

# Hold out 20% for evaluation. stratify keeps the proportion of each rating
# the same in the train and test sets, so the reported metrics are not skewed
# by an unlucky split of a rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_multi,
    test_size=0.2,
    random_state=42,
    stratify=y_multi,
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the held-out set leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# random_state pins the forest's randomness so reruns give the same model.
model_multi = RandomForestClassifier(random_state=42)
model_multi.fit(X_train, y_train)
y_pred_multi = model_multi.predict(X_test)

print("Multiclass Classification")
print("Accuracy:", accuracy_score(y_test, y_pred_multi))
print(classification_report(y_test, y_pred_multi))
print(confusion_matrix(y_test, y_pred_multi))

# Scaling turned the feature matrix into a plain array, so the importances are
# re-labelled with the original column names from X before plotting.
feature_importances = pd.Series(model_multi.feature_importances_, index=X.columns)
plt.figure(figsize=(10,6))
feature_importances.nlargest(10).plot(kind='barh')
plt.title('Top 10 Features - Multiclass Model')
plt.show()

# ---------------------------------------------------------------------------
# Binary model: rating of 4 or higher (1) versus lower (0)
# ---------------------------------------------------------------------------
y_binary = (data['Customer_Satisfaction_Rating'] >= 4).astype(int)

# Same 80/20 hold-out as the multiclass run, stratified on the binary label so
# both classes keep their proportions in the train and test sets.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_binary,
)

# Use a dedicated scaler for this split. Re-fitting the shared `scaler` here
# would overwrite the statistics that belong to the multiclass model.
scaler_b = StandardScaler()
X_train_b = scaler_b.fit_transform(X_train_b)
X_test_b = scaler_b.transform(X_test_b)

model_binary = RandomForestClassifier(random_state=42)
model_binary.fit(X_train_b, y_train_b)
y_pred_b = model_binary.predict(X_test_b)

print("Binary Classification")
print("Accuracy:", accuracy_score(y_test_b, y_pred_b))
print(classification_report(y_test_b, y_pred_b))
print(confusion_matrix(y_test_b, y_pred_b))

# Importances are labelled with the original column names from X, since the
# scaled training matrix no longer carries them.
feature_importances_b = pd.Series(model_binary.feature_importances_, index=X.columns)
plt.figure(figsize=(10,6))
feature_importances_b.nlargest(10).plot(kind='barh')
plt.title('Top 10 Features - Binary Model')
plt.show()
