In [None]:
print("hello")

hello


In [None]:
import pandas as pd
df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")
print("Dataset Loaded Successfully!")
df.head()

Dataset Loaded Successfully!


Unnamed: 0,Grievance ID,Category,Grievance Description,Customer Feedback,Urgency Level,Complaint Keywords
0,G0001,medical and health,My son is vomiting again and again. Age ï¿½ï¿½...,Thank you @RailwaySeva @RailMinIndia Due to yo...,high,"son, vomiting, age 13 years"
1,G0002,maintenance,Train no .22987...Ajmer to Agra...windows not ...,Thanks for the prompt service..Problem has bee...,medium,"windows, not having mirror"
2,G0003,maintenance,"Dear Team, Charging point of coach no. B6/42/M...",Complaint has been Solved... Thanks for Quick ...,medium,"charging point, not working"
3,G0004,train operations,"Train no 12360, hundreds of without ticket pas...",Thanks a lot. 2 officers came and finally I go...,high,"without ticket, passengers flooding everywhere"
4,G0005,maintenance,I'm in SS class My PNR is 2326704127 and this ...,Thanks @RailMadad @RailwaySeva @DrmAjmer compl...,medium,"windows, can't be locked"


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Drop rows with missing values in required columns
df = df.dropna(subset=["Grievance Description", "Category", "Urgency Level"])

# Encode Category
category_encoder = LabelEncoder()
df["Category_Encoded"] = category_encoder.fit_transform(df["Category"])

# Encode Urgency Level
urgency_encoder = LabelEncoder()
df["Urgency_Encoded"] = urgency_encoder.fit_transform(df["Urgency Level"])

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_text = tfidf.fit_transform(df["Grievance Description"])

print("Preprocessing Done!")


Preprocessing Done!


In [None]:
category_urgency_counts = df.groupby(["Category", "Urgency Level"]).size().unstack(fill_value=0)
print(category_urgency_counts)

Urgency Level       high  low  medium
Category                             
catering              21   49      45
customer service      10   63      48
housekeeping          63   17      75
luggage                7    4      15
maintenance           44   20      60
medical and health    26    2       7
security              19    4      18
train operations      19   37      30


In [None]:
# Identify unexpected urgency levels
unexpected_urgency = df[~df["Urgency Level"].isin(["high", "medium", "low"])]

# Display the row(s) with unexpected urgency levels
print(unexpected_urgency)


Empty DataFrame
Columns: [Grievance ID, Category, Grievance Description, Customer Feedback, Urgency Level, Complaint Keywords, Category_Encoded, Urgency_Encoded]
Index: []


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFE, SelectKBest, chi2, f_classif
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import nltk
import re
import warnings
warnings.filterwarnings('ignore')

In [None]:

# Download NLTK resources if not already downloaded
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load data
print("Loading and preprocessing data...")
df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")
df = df.dropna(subset=['Grievance Description', 'Urgency Level'])

# Map urgency levels to numerical values
df["Urgency Level"] = df["Urgency Level"].map({"low": 0, "medium": 1, "high": 2})
df = df.dropna(subset=['Urgency Level'])

# Display class distribution
print("Class distribution before sampling:")
print(df["Urgency Level"].value_counts(normalize=True).sort_index() * 100)

# Preprocess text data
df["Processed Description"] = df["Grievance Description"].apply(preprocess_text)

# Feature engineering
print("Extracting features...")



Loading and preprocessing data...
Class distribution before sampling:
Urgency Level
0    27.880512
1    42.389758
2    29.729730
Name: proportion, dtype: float64
Extracting features...


In [None]:
# TF-IDF features with optimized parameters
tfidf_vect = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    max_features=10000,
    min_df=2,
    max_df=0.9,
    use_idf=True,
    sublinear_tf=True  # Apply sublinear tf scaling (logarithmic)
)
X_tfidf = tfidf_vect.fit_transform(df["Processed Description"])

In [None]:

# Count vectorizer with different parameters for additional perspective
count_vect = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=8000,
    min_df=2,
    max_df=0.9
)
X_count = count_vect.fit_transform(df["Processed Description"])

In [None]:


# Extract text-based features
text_features = np.array([
    [
        len(text),                                       # Length of text
        len(text.split()),                               # Word count
        sum(1 for c in text if c.isupper()),             # Uppercase count
        text.count('!'),                                 # Exclamation count
        text.count('?'),                                 # Question mark count
        sum(1 for c in text if c.isdigit()),             # Digit count
        len(re.findall(r'\b(?:urgent|immediate|critical|emergency|asap|quickly|serious)\b', text.lower()))  # Urgency terms
    ]
    for text in df["Grievance Description"]
])

# Normalize text features
scaler = StandardScaler()
text_features_scaled = scaler.fit_transform(text_features)

In [None]:



# Combine all features
X_tfidf_array = X_tfidf.toarray()
X_count_array = X_count.toarray()
X = np.hstack((X_tfidf_array, X_count_array, text_features_scaled))
y = df["Urgency Level"].values

# Feature selection to reduce dimensionality and improve performance
print("Performing feature selection...")
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='median'
)
X = selector.fit_transform(X, y)
print(f"Features reduced from {X_tfidf_array.shape[1] + X_count_array.shape[1] + text_features_scaled.shape[1]} to {X.shape[1]}")


Performing feature selection...
Features reduced from 3722 to 1861


In [None]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Explore different sampling techniques
print("Applying advanced resampling...")
# Try different samplers and choose the best
samplers = {
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42),
    'SMOTETomek': SMOTETomek(random_state=42),
    'SMOTEENN': SMOTEENN(random_state=42)
}

best_sampler = None
best_score = 0

for name, sampler in samplers.items():
    print(f"Testing {name}...")
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    # Quick evaluation with Random Forest
    cv_scores = cross_val_score(
        RandomForestClassifier(n_estimators=100, random_state=42),
        X_resampled, y_resampled,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1_macro'
    )

    mean_score = cv_scores.mean()
    print(f"{name} cross-validation f1-macro: {mean_score:.4f}")

    if mean_score > best_score:
        best_score = mean_score
        best_sampler = name

print(f"Selected best sampler: {best_sampler}")
X_train, y_train = samplers[best_sampler].fit_resample(X_train, y_train)

print("Class distribution after resampling:")
print(pd.Series(y_train).value_counts(normalize=True).sort_index() * 100)

Applying advanced resampling...
Testing SMOTE...
SMOTE cross-validation f1-macro: 0.6419
Testing ADASYN...
ADASYN cross-validation f1-macro: 0.6152
Testing SMOTETomek...
SMOTETomek cross-validation f1-macro: 0.6492
Testing SMOTEENN...
SMOTEENN cross-validation f1-macro: 0.7798
Selected best sampler: SMOTEENN
Class distribution after resampling:
0    47.867299
1    13.744076
2    38.388626
Name: proportion, dtype: float64


In [None]:
# Model definition and tuning
print("Training models...")

# Define models with improved parameters
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    learning_rate=0.03,
    n_estimators=500,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    class_weight='balanced',
    random_state=42
)

gb = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    random_state=42
)

svc = SVC(
    C=10.0,
    kernel='rbf',
    gamma='scale',
    probability=True,
    class_weight='balanced',
    random_state=42
)

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    max_iter=300,
    random_state=42
)

# Stack of base models with meta-learner for better ensemble
from sklearn.ensemble import StackingClassifier

# Train base models
base_models = {
    'xgb': xgb,
    'rf': rf,
    'gb': gb,
    'svc': svc,
    'mlp': mlp
}

print("Training base models individually...")
base_predictions = {}
base_models_trained = {}

for name, model in base_models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    base_models_trained[name] = model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    base_predictions[name] = y_pred
    print(f"{name} - Accuracy: {accuracy:.4f}, F1 Macro: {f1:.4f}")
    print(classification_report(y_test, y_pred))


Training models...
Training base models individually...
Training xgb...
xgb - Accuracy: 0.4539, F1 Macro: 0.4479
              precision    recall  f1-score   support

           0       0.43      0.59      0.50        39
           1       0.56      0.23      0.33        60
           2       0.43      0.64      0.51        42

    accuracy                           0.45       141
   macro avg       0.47      0.49      0.45       141
weighted avg       0.49      0.45      0.43       141

Training rf...
rf - Accuracy: 0.4752, F1 Macro: 0.4784
              precision    recall  f1-score   support

           0       0.46      0.62      0.53        39
           1       0.49      0.35      0.41        60
           2       0.48      0.52      0.50        42

    accuracy                           0.48       141
   macro avg       0.48      0.50      0.48       141
weighted avg       0.48      0.48      0.47       141

Training gb...
gb - Accuracy: 0.4468, F1 Macro: 0.4438
              p

In [None]:
# Create a stacked ensemble model
print("Training stacked ensemble...")
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('rf', rf),
        ('gb', gb),
        ('svc', svc),
        ('mlp', mlp)
    ],
    final_estimator=XGBClassifier(
        objective='multi:softprob',
        num_class=3,
        learning_rate=0.03,
        n_estimators=200,
        random_state=42
    ),
    cv=5,
    stack_method='predict_proba',
    verbose=0
)

stacking_model.fit(X_train, y_train)
stack_pred = stacking_model.predict(X_test)
stack_accuracy = accuracy_score(y_test, stack_pred)
stack_f1 = f1_score(y_test, stack_pred, average='macro')

print("\nStacked Ensemble Results:")
print(f"Accuracy: {stack_accuracy:.4f}, F1 Macro: {stack_f1:.4f}")
print(classification_report(y_test, stack_pred))

Training stacked ensemble...

Stacked Ensemble Results:
Accuracy: 0.4681, F1 Macro: 0.4686
              precision    recall  f1-score   support

           0       0.49      0.46      0.47        39
           1       0.55      0.40      0.46        60
           2       0.40      0.57      0.47        42

    accuracy                           0.47       141
   macro avg       0.48      0.48      0.47       141
weighted avg       0.49      0.47      0.47       141



In [None]:
# Create voting ensemble
print("Training voting ensemble...")
voting_model = VotingClassifier(
    estimators=[
        ('xgb', base_models_trained['xgb']),
        ('rf', base_models_trained['rf']),
        ('gb', base_models_trained['gb']),
        ('svc', base_models_trained['svc']),
        ('mlp', base_models_trained['mlp'])
    ],
    voting='soft',  # Use probability-based voting
    weights=[3, 2, 2, 1, 1]  # Weight models by their individual performance
)

voting_model.fit(X_train, y_train)
voting_pred = voting_model.predict(X_test)
voting_accuracy = accuracy_score(y_test, voting_pred)
voting_f1 = f1_score(y_test, voting_pred, average='macro')

print("\nVoting Ensemble Results:")
print(f"Accuracy: {voting_accuracy:.4f}, F1 Macro: {voting_f1:.4f}")
print(classification_report(y_test, voting_pred))


Training voting ensemble...

Voting Ensemble Results:
Accuracy: 0.4610, F1 Macro: 0.4534
              precision    recall  f1-score   support

           0       0.42      0.56      0.48        39
           1       0.56      0.23      0.33        60
           2       0.45      0.69      0.55        42

    accuracy                           0.46       141
   macro avg       0.48      0.50      0.45       141
weighted avg       0.49      0.46      0.44       141



In [None]:
# Advanced ensemble with bias-correction
print("Training advanced weighted ensemble...")
# Get predictions from each model
all_preds = np.array([
    base_models_trained['xgb'].predict_proba(X_test),
    base_models_trained['rf'].predict_proba(X_test),
    base_models_trained['gb'].predict_proba(X_test),
    base_models_trained['svc'].predict_proba(X_test),
    base_models_trained['mlp'].predict_proba(X_test)
])

# Use weighted average for final predictions
# Weights are dynamically adjusted based on individual model performances
individual_accuracies = [
    accuracy_score(y_test, base_predictions['xgb']),
    accuracy_score(y_test, base_predictions['rf']),
    accuracy_score(y_test, base_predictions['gb']),
    accuracy_score(y_test, base_predictions['svc']),
    accuracy_score(y_test, base_predictions['mlp'])
]
# Normalize weights
weights = np.array(individual_accuracies) / sum(individual_accuracies)
print("Model weights:", weights)


Training advanced weighted ensemble...
Model weights: [0.19692308 0.20615385 0.19384615 0.21538462 0.18769231]


In [None]:

# Calculate weighted average probabilities
weighted_probs = sum(weights[i] * all_preds[i] for i in range(len(all_preds)))
advanced_ensemble_pred = np.argmax(weighted_probs, axis=1)

advanced_accuracy = accuracy_score(y_test, advanced_ensemble_pred)
advanced_f1 = f1_score(y_test, advanced_ensemble_pred, average='macro')

print("\nAdvanced Weighted Ensemble Results:")
print(f"Accuracy: {advanced_accuracy:.4f}, F1 Macro: {advanced_f1:.4f}")

print(classification_report(y_test, advanced_ensemble_pred))



Advanced Weighted Ensemble Results:
Accuracy: 0.4539, F1 Macro: 0.4460
              precision    recall  f1-score   support

           0       0.41      0.54      0.47        39
           1       0.56      0.23      0.33        60
           2       0.45      0.69      0.54        42

    accuracy                           0.45       141
   macro avg       0.47      0.49      0.45       141
weighted avg       0.49      0.45      0.43       141



In [None]:
# Visualize confusion matrices
models = {
    'XGBoost': base_predictions['xgb'],
    'Random Forest': base_predictions['rf'],
    'Stacked Ensemble': stack_pred,
    'Voting Ensemble': voting_pred,
    'Advanced Weighted Ensemble': advanced_ensemble_pred
}

plt.figure(figsize=(20, 15))
for i, (name, preds) in enumerate(models.items(), 1):
    plt.subplot(2, 3, i)
    cm = confusion_matrix(y_test, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Low', 'Medium', 'High'],
                yticklabels=['Low', 'Medium', 'High'])
    plt.title(f'{name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

plt.tight_layout()
plt.savefig('confusion_matrices.png')
plt.close()

# Compare all results
print("\nFinal Accuracy Comparison:")
results = {
    'XGBoost': accuracy_score(y_test, base_predictions['xgb']),
    'Random Forest': accuracy_score(y_test, base_predictions['rf']),
    'Gradient Boosting': accuracy_score(y_test, base_predictions['gb']),
    'SVC': accuracy_score(y_test, base_predictions['svc']),
    'MLP': accuracy_score(y_test, base_predictions['mlp']),
    'Stacked Ensemble': stack_accuracy,
    'Voting Ensemble': voting_accuracy,
    'Advanced Weighted Ensemble': advanced_accuracy
}

for model, acc in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"{model}: {acc:.4f}")


Final Accuracy Comparison:
SVC: 0.4965
Random Forest: 0.4752
Stacked Ensemble: 0.4681
Voting Ensemble: 0.4610
XGBoost: 0.4539
Advanced Weighted Ensemble: 0.4539
Gradient Boosting: 0.4468
MLP: 0.4326


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")
df = df.dropna(subset=['Grievance Description', 'Urgency Level'])
df["Urgency Level"] = df["Urgency Level"].str.lower()
df = df[df["Urgency Level"].isin(["low", "medium", "high"])]
urgency_mapping = {"low": 0, "medium": 1, "high": 2}
df["Urgency Level"] = df["Urgency Level"].map(urgency_mapping)
tfidf_vect = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=5000)
X = tfidf_vect.fit_transform(df["Grievance Description"]).toarray()
y = df["Urgency Level"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

base_models = [
    ('svc', SVC(kernel='linear', probability=True, C=1.0)),
    ('xgb', XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, use_label_encoder=False, eval_metric='mlogloss')),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50,), activation='relu', solver='adam', max_iter=500))
]

meta_learner = LogisticRegression()
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_learner, passthrough=True)
stacked_model.fit(X_train, y_train)
y_pred = stacked_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.4752
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.33      0.41        39
           1       0.44      0.65      0.52        60
           2       0.54      0.36      0.43        42

    accuracy                           0.48       141
   macro avg       0.51      0.45      0.45       141
weighted avg       0.50      0.48      0.46       141



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Text preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into text
    processed_text = ' '.join(tokens)

    return processed_text

# Load and preprocess the data
df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")
df = df.dropna(subset=['Grievance Description', 'Urgency Level'])
df["Urgency Level"] = df["Urgency Level"].str.lower()
df = df[df["Urgency Level"].isin(["low", "medium", "high"])]

# Add Category as a feature if it's available in your dataset
has_category = 'Category' in df.columns

# Preprocess the text
df["Processed_Description"] = df["Grievance Description"].apply(preprocess_text)

# Extract additional features
df['Description_Length'] = df['Grievance Description'].apply(len)
df['Word_Count'] = df['Grievance Description'].apply(lambda x: len(str(x).split()))

# Map urgency levels to numerical values
urgency_mapping = {"low": 0, "medium": 1, "high": 2}
df["Urgency Level"] = df["Urgency Level"].map(urgency_mapping)

# TF-IDF transformation with improved parameters
tfidf_vect = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),  # Include up to trigrams
    max_features=10000,  # Increase features
    min_df=2,           # Minimum document frequency
    max_df=0.9,         # Maximum document frequency
    sublinear_tf=True   # Apply sublinear tf scaling
)

# Prepare feature set
X_text = tfidf_vect.fit_transform(df["Processed_Description"])
additional_features = df[['Description_Length', 'Word_Count']].values

# Add Category as one-hot encoded features if available
if has_category:
    category_dummies = pd.get_dummies(df['Category'], prefix='category')
    additional_features = np.hstack((additional_features, category_dummies.values))

# Combine text features and additional features
X = np.hstack((X_text.toarray(), additional_features))
y = df["Urgency Level"]

# Split the data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Set up base models with optimized hyperparameters
base_models = [
    ('svc', SVC(
        kernel='rbf',
        C=10.0,
        gamma='scale',
        probability=True,
        class_weight='balanced'
    )),
    ('xgb', XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.01,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        eval_metric='mlogloss',
        use_label_encoder=False
    )),
    ('rf', RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        bootstrap=True,
        class_weight='balanced'
    )),
    ('gb', GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        min_samples_split=4,
        min_samples_leaf=2,
        subsample=0.8
    )),
    ('mlp', MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        batch_size=64,
        learning_rate='adaptive',
        max_iter=1000,
        early_stopping=True
    ))
]

# Set up meta learner with cross-validation
meta_learner = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced'
)

# Create and train the stacked model with cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=cv,
    passthrough=True
)

# Fit the stacked model on the resampled training data
stacked_model.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate on the test set
y_pred = stacked_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", conf_matrix)
'''
# Function to predict urgency for new grievances
def predict_urgency(grievance_text, model=stacked_model):
    # Preprocess the text
    processed_text = preprocess_text(grievance_text)

    # Transform with TF-IDF
    text_features = tfidf_vect.transform([processed_text]).toarray()

    # Calculate additional features
    desc_length = len(grievance_text)
    word_count = len(grievance_text.split())
    add_features = np.array([[desc_length, word_count]])

    # Combine features
    features = np.hstack((text_features, add_features))

    # If category features were used, add placeholder zeros
    if has_category and additional_features.shape[1] > 2:
        category_zeros = np.zeros((1, additional_features.shape[1] - 2))
        features = np.hstack((features, category_zeros))

    # Predict
    urgency_num = model.predict(features)[0]

    # Map back to labels
    urgency_labels = {0: "low", 1: "medium", 2: "high"}
    return urgency_labels[urgency_num]'''

Accuracy: 0.4894
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.44      0.52        39
           1       0.45      0.62      0.52        60
           2       0.47      0.36      0.41        42

    accuracy                           0.49       141
   macro avg       0.52      0.47      0.48       141
weighted avg       0.51      0.49      0.49       141

Confusion Matrix:
 [[17 20  2]
 [ 8 37 15]
 [ 2 25 15]]


'\n# Function to predict urgency for new grievances\ndef predict_urgency(grievance_text, model=stacked_model):\n    # Preprocess the text\n    processed_text = preprocess_text(grievance_text)\n    \n    # Transform with TF-IDF\n    text_features = tfidf_vect.transform([processed_text]).toarray()\n    \n    # Calculate additional features\n    desc_length = len(grievance_text)\n    word_count = len(grievance_text.split())\n    add_features = np.array([[desc_length, word_count]])\n    \n    # Combine features\n    features = np.hstack((text_features, add_features))\n    \n    # If category features were used, add placeholder zeros\n    if has_category and additional_features.shape[1] > 2:\n        category_zeros = np.zeros((1, additional_features.shape[1] - 2))\n        features = np.hstack((features, category_zeros))\n    \n    # Predict\n    urgency_num = model.predict(features)[0]\n    \n    # Map back to labels\n    urgency_labels = {0: "low", 1: "medium", 2: "high"}\n    return urge

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Custom text preprocessing with memory efficiency in mind
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Convert to lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and short words (often not meaningful)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Stemming (less resource-intensive than lemmatization)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)

# Feature extraction functions
def extract_features(df):
    # Add important text metadata features
    features = pd.DataFrame()

    # Text length features
    features['text_length'] = df['Grievance Description'].apply(lambda x: len(str(x)))

    # Word count
    features['word_count'] = df['Grievance Description'].apply(lambda x: len(str(x).split()))

    # Exclamation mark count (might indicate urgency)
    features['exclamation_count'] = df['Grievance Description'].apply(lambda x: str(x).count('!'))

    # Question mark count
    features['question_count'] = df['Grievance Description'].apply(lambda x: str(x).count('?'))

    # Uppercase word count (might indicate emphasis/urgency)
    features['uppercase_word_count'] = df['Grievance Description'].apply(
        lambda x: sum(1 for word in str(x).split() if word.isupper() and len(word) > 1)
    )

    # Category as feature if available
    if 'Category' in df.columns:
        features['Category'] = df['Category']

    return features

# Load and prepare the data with memory efficiency in mind
print("Loading and preprocessing data...")
df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")
df = df.dropna(subset=['Grievance Description', 'Urgency Level'])
df["Urgency Level"] = df["Urgency Level"].str.lower()
df = df[df["Urgency Level"].isin(["low", "medium", "high"])]

# Map urgency levels to numerical values
urgency_mapping = {"low": 0, "medium": 1, "high": 2}
df["Urgency Level"] = df["Urgency Level"].map(urgency_mapping)

# Print class distribution
print("Class distribution before sampling:")
print(df["Urgency Level"].value_counts())

# Extract additional features
print("Extracting features...")
additional_features = extract_features(df)

# Apply text preprocessing
print("Preprocessing text...")
df['processed_text'] = df['Grievance Description'].apply(preprocess_text)

# Create category features if available
categorical_features = []
if 'Category' in additional_features.columns:
    categorical_features = ['Category']
    print(f"Using categories: {df['Category'].unique()}")

numeric_features = [col for col in additional_features.columns if col != 'Category']

# Use smaller TF-IDF parameters to reduce memory usage
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=2000,  # Reduced feature count
    min_df=2,
    max_df=0.85,
    ngram_range=(1, 2),
    sublinear_tf=True,
    use_idf=True,
    norm='l2'
)

# Split data with stratification
X_text = df['processed_text']
X_features = additional_features
y = df["Urgency Level"]

# Create preprocessing pipelines
print("Building model pipeline...")
text_pipeline = Pipeline([
    ('tfidf', tfidf)
])

# Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_pipeline, 'processed_text'),
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='passthrough'
)

if categorical_features:
    # Add categorical features
    cat_preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('preprocess', preprocessor, ['processed_text'] + numeric_features),
            ('cat_encode', cat_preprocessor, categorical_features)
        ]
    )

# Use a memory-efficient model - Random Forest with limited depth and trees
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_model)
])

# Prepare data for pipeline
X_combined = pd.concat([
    X_text.reset_index(drop=True),
    X_features.reset_index(drop=True)
], axis=1)

# Stratified sampling to handle class imbalance
# Undersample the majority classes instead of oversampling to save memory
class_counts = df["Urgency Level"].value_counts()
min_class_count = class_counts.min()
balanced_df = pd.DataFrame()

for class_val in df["Urgency Level"].unique():
    class_df = df[df["Urgency Level"] == class_val]
    if len(class_df) > min_class_count:
        # Undersample
        class_df = class_df.sample(min_class_count, random_state=42)
    balanced_df = pd.concat([balanced_df, class_df])

# Recreate features and text after balancing
balanced_X_text = balanced_df['processed_text']
balanced_X_features = extract_features(balanced_df)
balanced_y = balanced_df["Urgency Level"]

balanced_X_combined = pd.concat([
    balanced_X_text.reset_index(drop=True),
    balanced_X_features.reset_index(drop=True)
], axis=1)

# Verify class balance
print("Class distribution after balancing:")
print(balanced_y.value_counts())

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    balanced_X_combined, balanced_y,
    test_size=0.2,
    stratify=balanced_y,
    random_state=42
)

# Fit the model
print("Training model...")
pipeline.fit(X_train, y_train)

# Evaluate
print("Evaluating model...")
# Get cross-validation score first
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Predict on test set
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature importance analysis
if hasattr(pipeline.named_steps['classifier'], 'feature_importances_'):
    importances = pipeline.named_steps['classifier'].feature_importances_
    print("\nTop important features:")
    if hasattr(pipeline.named_steps['preprocessor'], 'get_feature_names_out'):
        feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)
        print(importance_df.head(20))
'''
# Simple function to predict on new data
def predict_urgency(text, category=None):
    # Create a DataFrame with the same structure as training data
    data = pd.DataFrame({
        'processed_text': [preprocess_text(text)],
    })

    # Add text features
    data['text_length'] = [len(text)]
    data['word_count'] = [len(text.split())]
    data['exclamation_count'] = [text.count('!')]
    data['question_count'] = [text.count('?')]
    data['uppercase_word_count'] = [sum(1 for word in text.split() if word.isupper() and len(word) > 1)]

    # Add category if provided and was used in training
    if category is not None and 'Category' in additional_features.columns:
        data['Category'] = [category]

    # Make prediction
    prediction = pipeline.predict(data)[0]
    proba = pipeline.predict_proba(data)[0]

    # Map back to labels
    urgency_labels = {0: "low", 1: "medium", 2: "high"}

    return {
        'prediction': urgency_labels[prediction],
        'confidence': max(proba) * 100,
        'probabilities': {urgency_labels[i]: proba[i] * 100 for i in range(len(proba))}
    }

# Simple sentiment-based validation model to check our results
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

print("\nTraining simple validation model...")
count_vec = CountVectorizer(max_features=1000)
X_text_counts = count_vec.fit_transform(balanced_X_text)
simple_model = MultinomialNB()
simple_model.fit(X_text_counts, balanced_y)

# Check validation model accuracy
X_test_counts = count_vec.transform(X_test['processed_text'])
simple_pred = simple_model.predict(X_test_counts)
simple_accuracy = accuracy_score(y_test, simple_pred)
print(f"Simple model accuracy: {simple_accuracy:.4f}")

# Try to determine what type of texts are classified as what urgency level
print("\nAnalyzing patterns in urgency levels:")
for level in [0, 1, 2]:
    level_texts = df[df["Urgency Level"] == level]['processed_text']
    if len(level_texts) > 0:
        # Get most common words for this level
        level_counts = CountVectorizer(max_features=20).fit_transform(level_texts)
        level_words = CountVectorizer(max_features=20).fit(level_texts).get_feature_names_out()
        level_word_counts = level_counts.sum(axis=0).tolist()[0]
        level_word_freq = sorted(zip(level_words, level_word_counts), key=lambda x: x[1], reverse=True)

        urgency_labels = {0: "low", 1: "medium", 2: "high"}
        print(f"{urgency_labels[level]} urgency common words: {level_word_freq[:10]}")

# Save model for future use
import pickle
print("\nSaving model to 'grievance_model.pkl'")
with open('grievance_model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
print("Model saved successfully")'''

Loading and preprocessing data...
Class distribution before sampling:
Urgency Level
1    298
2    209
0    196
Name: count, dtype: int64
Extracting features...
Preprocessing text...
Using categories: ['medical and health' 'maintenance' 'train operations' 'housekeeping'
 'catering' 'security' 'customer service' 'luggage']
Creating TF-IDF features...
Building model pipeline...
Class distribution after balancing:
Urgency Level
2    196
1    196
0    196
Name: count, dtype: int64
Training model...
Evaluating model...
Cross-validation accuracy: 0.4894 ± 0.0404
Test accuracy: 0.5339
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.77      0.65        39
           1       0.39      0.28      0.33        39
           2       0.61      0.55      0.58        40

    accuracy                           0.53       118
   macro avg       0.52      0.53      0.52       118
weighted avg       0.52      0.53      0.52       118

Confusion M

'\n# Simple function to predict on new data\ndef predict_urgency(text, category=None):\n    # Create a DataFrame with the same structure as training data\n    data = pd.DataFrame({\n        \'processed_text\': [preprocess_text(text)],\n    })\n    \n    # Add text features\n    data[\'text_length\'] = [len(text)]\n    data[\'word_count\'] = [len(text.split())]\n    data[\'exclamation_count\'] = [text.count(\'!\')]\n    data[\'question_count\'] = [text.count(\'?\')]\n    data[\'uppercase_word_count\'] = [sum(1 for word in text.split() if word.isupper() and len(word) > 1)]\n    \n    # Add category if provided and was used in training\n    if category is not None and \'Category\' in additional_features.columns:\n        data[\'Category\'] = [category]\n    \n    # Make prediction\n    prediction = pipeline.predict(data)[0]\n    proba = pipeline.predict_proba(data)[0]\n    \n    # Map back to labels\n    urgency_labels = {0: "low", 1: "medium", 2: "high"}\n    \n    return {\n        \'pr

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import warnings
import random
import string
from collections import Counter

warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

print("Loading and preparing data...")
df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")
df = df.dropna(subset=['Grievance Description', 'Urgency Level'])
df["Urgency Level"] = df["Urgency Level"].str.lower()
df = df[df["Urgency Level"].isin(["low", "medium", "high"])]

label_encoder = LabelEncoder()
df["urgency_encoded"] = label_encoder.fit_transform(df["Urgency Level"])

print("Original class distribution:")
print(df["Urgency Level"].value_counts())

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

print("Preprocessing text...")
df['processed_text'] = df['Grievance Description'].apply(preprocess_text)

def extract_features(text_series):
    features = pd.DataFrame()
    features['text_length'] = text_series.apply(len)
    features['word_count'] = text_series.apply(lambda x: len(str(x).split()))
    features['avg_word_length'] = text_series.apply(
        lambda x: np.mean([len(word) for word in str(x).split()]) if len(str(x).split()) > 0 else 0
    )
    features['unique_word_ratio'] = text_series.apply(
        lambda x: len(set(str(x).split())) / (len(str(x).split()) or 1)
    )
    features['exclamation_count'] = text_series.apply(lambda x: str(x).count('!'))
    features['question_count'] = text_series.apply(lambda x: str(x).count('?'))
    features['uppercase_ratio'] = text_series.apply(
        lambda x: sum(1 for c in str(x) if c.isupper()) / (len(str(x)) or 1)
    )
    features['number_count'] = text_series.apply(lambda x: sum(c.isdigit() for c in str(x)))
    return features

print("Extracting features...")
text_features = extract_features(df['Grievance Description'])

if 'Category' in df.columns:
    print("Using category information...")
    category_dummies = pd.get_dummies(df['Category'], prefix='category')
    text_features = pd.concat([text_features, category_dummies], axis=1)

def augment_data(df, target_count_per_class=300):
    augmented_df = df.copy()
    for urgency_level in df["Urgency Level"].unique():
        class_df = df[df["Urgency Level"] == urgency_level]
        current_count = len(class_df)
        if current_count < target_count_per_class:
            samples_needed = target_count_per_class - current_count
            new_samples = []
            for i in range(samples_needed):
                sample = class_df.sample(1).iloc[0]
                description = sample['Grievance Description']
                words = description.split()
                if len(words) > 3:
                    idx = random.randint(0, len(words) - 1)
                    if words[idx].lower() in ['problem', 'issue']:
                        words[idx] = random.choice(['concern', 'trouble', 'matter'])
                    augmented_text = ' '.join(words)
                    new_sample = sample.copy()
                    new_sample['Grievance Description'] = augmented_text
                    new_samples.append(new_sample)
            augmented_df = pd.concat([augmented_df, pd.DataFrame(new_samples)], ignore_index=True)
    return augmented_df

print("Augmenting data...")
df = augment_data(df)

print("Preprocessing text after augmentation...")
df['processed_text'] = df['Grievance Description'].apply(preprocess_text)

print("Extracting features after augmentation...")
text_features = extract_features(df['Grievance Description'])

print("Vectorizing text...")
tfidf = TfidfVectorizer(max_features=5000)
text_vectors = tfidf.fit_transform(df['processed_text']).toarray()

# Ensure both text_vectors and text_features have the same number of rows
print(f"text_vectors shape: {text_vectors.shape}, text_features shape: {text_features.shape}")

X = np.hstack((text_vectors, text_features))
y = df['urgency_encoded']

'''
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Training models...")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
nb = MultinomialNB()
lr = LogisticRegression(max_iter=200, random_state=42)
svm = LinearSVC(random_state=42)

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('nb', nb), ('lr', lr), ('svm', svm)], voting='hard'
)

ensemble.fit(X_train, y_train)'''

from sklearn.preprocessing import MinMaxScaler

print("Scaling features to non-negative range...")
scaler = MinMaxScaler()
X = scaler.fit_transform(X)  # Ensures all values are >= 0

print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

print("Training models...")
ensemble = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('nb', MultinomialNB()),  # Now works because X is non-negative
        ('lr', LogisticRegression(max_iter=500)),
        ('svm', LinearSVC())
    ],
    voting='hard'
)

ensemble.fit(X_train, y_train)


print("Evaluating model...")
y_pred = ensemble.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Loading and preparing data...
Original class distribution:
Urgency Level
medium    298
high      209
low       196
Name: count, dtype: int64
Preprocessing text...
Extracting features...
Using category information...
Augmenting data...
Preprocessing text after augmentation...
Extracting features after augmentation...
Vectorizing text...
text_vectors shape: (900, 2856), text_features shape: (900, 8)
Scaling features to non-negative range...
Applying SMOTE...
Training models...
Evaluating model...
Accuracy: 0.6500

Classification Report:
               precision    recall  f1-score   support

        high       0.66      0.75      0.70        60
         low       0.69      0.78      0.73        60
      medium       0.57      0.42      0.48        60

    accuracy                           0.65       180
   macro avg       0.64      0.65      0.64       180
weighted avg       0.64      0.65      0.64       180


Confusion Matrix:
 [[45  6  9]
 [ 3 47 10]
 [20 15 25]]


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from nltk.sentiment import SentimentIntensityAnalyzer
from imblearn.over_sampling import SMOTE
import nltk
import re
import warnings
import random
from collections import Counter

warnings.filterwarnings('ignore')

# Ensure NLTK resources are available
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

print("Loading and preparing data...")
df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")
df = df.dropna(subset=['Grievance Description', 'Urgency Level'])
df["Urgency Level"] = df["Urgency Level"].str.lower()
df = df[df["Urgency Level"].isin(["low", "medium", "high"])]

# Encode labels
label_encoder = LabelEncoder()
df["urgency_encoded"] = label_encoder.fit_transform(df["Urgency Level"])

print("Original class distribution:")
print(df["Urgency Level"].value_counts())

# Text Preprocessing
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    return text

print("Preprocessing text...")
df['processed_text'] = df['Grievance Description'].apply(preprocess_text)

# Feature Engineering
print("Extracting features...")
def extract_features(text_series):
    features = pd.DataFrame()
    features['text_length'] = text_series.apply(len)
    features['word_count'] = text_series.apply(lambda x: len(str(x).split()))
    features['exclamation_count'] = text_series.apply(lambda x: str(x).count('!'))
    return features

text_features = extract_features(df['Grievance Description'])

# Sentiment Score
sia = SentimentIntensityAnalyzer()
df['sentiment_score'] = df['Grievance Description'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

# Add Sentiment Score to Features
text_features['sentiment_score'] = df['sentiment_score']

# Data Augmentation
def augment_data(df, target_count_per_class=300):
    augmented_df = df.copy()
    for urgency_level in df["Urgency Level"].unique():
        class_df = df[df["Urgency Level"] == urgency_level]
        current_count = len(class_df)
        if current_count < target_count_per_class:
            samples_needed = target_count_per_class - current_count
            new_samples = [class_df.sample(1, replace=True).iloc[0] for _ in range(samples_needed)]
            augmented_df = pd.concat([augmented_df, pd.DataFrame(new_samples)], ignore_index=True)
    return augmented_df


print("Augmenting data...")
df = augment_data(df)

print("Preprocessing text after augmentation...")
df['processed_text'] = df['Grievance Description'].apply(preprocess_text)

print("Extracting features after augmentation...")
text_features = extract_features(df['Grievance Description'])
text_features['sentiment_score'] = df['sentiment_score']

print("Vectorizing text with N-grams...")
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
text_vectors = tfidf.fit_transform(df['processed_text']).toarray()

# Feature Scaling
scaler = StandardScaler()
X_text_features = scaler.fit_transform(text_features)

# Merge All Features
X = np.hstack((text_vectors, X_text_features))
y = df['urgency_encoded']

print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Model Training
print("Training models...")
stacked_model = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5)),
        ('cat', CatBoostClassifier(iterations=200, depth=5, learning_rate=0.1, verbose=0)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
    ],
    final_estimator=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
    cv=5
)

stacked_model.fit(X_train, y_train)

# Evaluation
print("Evaluating model...")
y_pred = stacked_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Loading and preparing data...
Original class distribution:
Urgency Level
medium    298
high      209
low       196
Name: count, dtype: int64
Preprocessing text...
Extracting features...
Augmenting data...
Preprocessing text after augmentation...
Extracting features after augmentation...
Vectorizing text with N-grams...
Applying SMOTE...
Training models...
Evaluating model...
Accuracy: 0.6722

Classification Report:
              precision    recall  f1-score   support

        high       0.83      0.57      0.67        60
         low       0.73      0.75      0.74        60
      medium       0.55      0.70      0.61        60

    accuracy                           0.67       180
   macro avg       0.70      0.67      0.67       180
weighted avg       0.70      0.67      0.67       180


Confusion Matrix:
[[34  5 21]
 [ 1 45 14]
 [ 6 12 42]]


In [None]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [None]:
import numpy as np
import pandas as pd
import nltk
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from textblob import TextBlob
from deep_translator import GoogleTranslator

nltk.download('wordnet')
from nltk.corpus import wordnet

# Function to paraphrase text by replacing synonyms
def paraphrase_text(text):
    words = text.split()
    new_words = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            new_word = synonyms[0].lemmas()[0].name()
            new_words.append(new_word)
        else:
            new_words.append(word)
    return " ".join(new_words)

# Function for back-translation (English → French → English)
def back_translate(text):
    french_text = GoogleTranslator(source="auto", target="fr").translate(text)
    return GoogleTranslator(source="fr", target="en").translate(french_text)

# Data Augmentation
def augment_data(df, target_count_per_class=400):
    augmented_df = df.copy()
    for urgency_level in df["Urgency Level"].unique():
        class_df = df[df["Urgency Level"] == urgency_level]
        current_count = len(class_df)
        if current_count < target_count_per_class:
            samples_needed = target_count_per_class - current_count
            new_samples = []
            for _ in range(samples_needed):
                sample = class_df.sample(1, replace=True).iloc[0].copy()
                if random.random() > 0.5:
                    sample["Grievance Description"] = paraphrase_text(sample["Grievance Description"])
                else:
                    sample["Grievance Description"] = back_translate(sample["Grievance Description"])
                new_samples.append(sample)
            augmented_df = pd.concat([augmented_df, pd.DataFrame(new_samples)], ignore_index=True)
    return augmented_df

# Feature Extraction
def extract_features(df):
    df["text_length"] = df["Grievance Description"].apply(len)
    df["num_words"] = df["Grievance Description"].apply(lambda x: len(x.split()))
    df["sentiment"] = df["Grievance Description"].apply(lambda x: TextBlob(x).sentiment.polarity)
    return df

# Load dataset
df = pd.read_csv("proc_ds.csv", encoding="ISO-8859-1")

# Augment Data
df = augment_data(df)

# Feature Extraction
df = extract_features(df)

# Encode Labels
df["urgency_encoded"] = df["Urgency Level"].map({"low": 0, "medium": 1, "high": 2})

# Vectorize Text
tfidf = TfidfVectorizer(ngram_range=(1, 5), analyzer="char_wb")
text_vectors = tfidf.fit_transform(df["Grievance Description"]).toarray()

# Scale Features
scaler = StandardScaler()
text_features = scaler.fit_transform(df[["text_length", "num_words", "sentiment"]])

# Combine Features
X = np.hstack((text_vectors, text_features))
y = df["urgency_encoded"]

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define Models
base_models = [
    ("rf", RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
    ("xgb", XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)),
]

# Stacking Classifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=GradientBoostingClassifier(n_estimators=100))

# Train Model
stacking_model.fit(X_train, y_train)

# Evaluate Model
accuracy = stacking_model.score(X_test, y_test)
print(f"🚀 Final Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


🚀 Final Accuracy: 0.7292


In [None]:
import pickle

# Define filenames
model_filename = "stacking_model_prior_72.pkl"
tfidf_filename = "tfidf_vectorizer_prior_72.pkl"
scaler_filename = "scaler_prior_72.pkl"
encoder_filename = "urgency_encoder_prior_72.pkl"

# Save Stacking Model
with open(model_filename, "wb") as model_file:
    pickle.dump(stacking_model, model_file)

# Save TF-IDF Vectorizer
with open(tfidf_filename, "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

# Save Standard Scaler
with open(scaler_filename, "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

# Save Label Encoder (Urgency Mapping)
urgency_mapping = {"low": 0, "medium": 1, "high": 2}
with open(encoder_filename, "wb") as encoder_file:
    pickle.dump(urgency_mapping, encoder_file)

print("✅ Model and necessary components saved successfully!")


✅ Model and necessary components saved successfully!
