##### Final Project AIT - 526

###### Vrishin Reddy Minkuri - G01444633
###### Rishitha Reddy Bitla - G01448537
###### Keerthana Reddy Kalva - G01448551 
###### Rohith Reddy Marlapally - G01450316

#### Base Line Modal Integration - 2

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import string
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>
[nltk_data] Error loading wordnet: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>


False

In [3]:
# Load the dataset from the specified path
df = pd.read_csv(r'C:\Users\vrishin\Documents\AIT_526_MP\Emotion_Text.csv')
print(df.head)

<bound method NDFrame.head of                                            Sentiment_Text  Sentiment_label
0       i feel awful about it too because it s my job ...                0
1                                   im alone i feel awful                0
2       ive probably mentioned this before but i reall...                1
3                i was feeling a little low few days back                0
4       i beleive that i am much more sensitive to oth...                2
...                                                   ...              ...
416804  that was what i felt when i was finally accept...                1
416805  i take every day as it comes i m just focussin...                4
416806      i just suddenly feel that everything was fake                0
416807  im feeling more eager than ever to claw back w...                1
416808  i give you plenty of attention even when i fee...                0

[416809 rows x 2 columns]>


In [None]:
df_cleaned = df[['Sentiment_Text', 'Sentiment_label']]
df_cleaned = df_cleaned[pd.notnull(df_cleaned['Sentiment_Text'])]

#### Text Cleaning and Preprocessing

In [None]:

def text_cleaning(df):
    """
    Cleans text data within a DataFrame by removing punctuation and converting text to lowercase.
    
    Args:
    - df (pd.DataFrame): DataFrame containing a column 'Sentiment_Text' with text to clean.

    Returns:
    - df (pd.DataFrame): DataFrame with an additional column 'cleaned_text' that contains the cleaned text data.
    """
    df['cleaned_text'] = ""
    for index, row in df.iterrows():
        text = row['Sentiment_Text'].lower()
        cleaned = [char for char in text if char not in string.punctuation]
        cleaned = "".join(cleaned)
        df.at[index, 'cleaned_text'] = cleaned
    return df

# Clean the dataset
df_cleaned = text_cleaning(df_cleaned)
df_cleaned = df_cleaned.reset_index(drop=True)

# Alternative save path to the Desktop
desktop_dir = os.path.join(os.path.expanduser("~"), "Desktop")
save_dir = os.path.join(desktop_dir, "model_saves")
os.makedirs(save_dir, exist_ok=True)
print(f"Models will be saved in: {save_dir}")

Models will be saved in: C:\Users\vrishin\Desktop\model_saves


#### Feature Extraction and Model Training (TF-IDF with ML Models)

In [None]:

def text_features(df, test_size):

    """
        Processes text data for sentiment analysis, trains multiple ML models, and saves them.
    
    Args:
    - df (pd.DataFrame): The input DataFrame containing at least two columns: 'cleaned_text' and 'Sentiment_label'.
    - test_size (float): The proportion of the dataset to include in the test split.

    Processes:
    - Splits the data into training and testing sets.
    - Fits a TF-IDF Vectorizer to the training data and transforms both training and test data.
    - Initializes and fits various machine learning models on the transformed training data.
    - Saves each model and the vectorizer to disk.

    Initializes the Following Models:
    - Naive Bayes
    - Support Vector Machine (SVM)
    - Logistic Regression
    - Lasso Regression
    - Decision Tree
    - Random Forest
    - K-Nearest Neighbors (KNN)
    - XGBoost

    Outputs:
    - models (dict): A dictionary of fitted models, keyed by model name.
    - tfidf_vect (TfidfVectorizer): The fitted TF-IDF vectorizer.
    - X_train (sparse matrix): The transformed training feature set.
    - X_test (sparse matrix): The transformed test feature set.
    - y_train (pd.Series): The training labels.
    - y_test (pd.Series): The test labels.

    Saves:
    - Individual model files for each classifier in a specified directory (defined by 'save_dir').
    - TF-IDF vectorizer file in the same directory.
    
    Returns:
    - A tuple containing references to the models dictionary, vectorizer, and dataset splits.
    """
    
    X = df['cleaned_text']
    y = df['Sentiment_label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # TF-IDF Vectorizer
    tfidf_vect = TfidfVectorizer(min_df=1, stop_words='english')
    tfidf_vect.fit(X_train)

    # Initialize Models
    models = {
        'Naive Bayes': MultinomialNB().fit(tfidf_vect.transform(X_train), y_train),
        'SVM': LinearSVC().fit(tfidf_vect.transform(X_train), y_train),
        'Logistic Regression': LogisticRegression(max_iter=1000).fit(tfidf_vect.transform(X_train), y_train),
        'Lasso Regression': LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000).fit(tfidf_vect.transform(X_train), y_train),
        'Decision Tree': DecisionTreeClassifier().fit(tfidf_vect.transform(X_train), y_train),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42).fit(tfidf_vect.transform(X_train), y_train),
        'KNN': KNeighborsClassifier(n_neighbors=5).fit(tfidf_vect.transform(X_train), y_train),
        'XGBoost': XGBClassifier(eval_metric='mlogloss').fit(tfidf_vect.transform(X_train), y_train)
    }

    # Save models and vectorizer
    for name, model in models.items():
        filename = os.path.join(save_dir, f"{name.lower().replace(' ', '_')}_text_emotion_model.sav")
        try:
            with open(filename, 'wb') as file:
                pickle.dump(model, file)
            print(f"{name} model saved successfully at {filename}")
        except OSError as e:
            print(f"Error saving {name} model: {e}")

    vectorizer_path = os.path.join(save_dir, 'tfidf_vect.pk')
    try:
        with open(vectorizer_path, 'wb') as fin:
            pickle.dump(tfidf_vect, fin)
        print("Vectorizer saved successfully.")
    except OSError as e:
        print(f"Error saving vectorizer: {e}")

    return models, tfidf_vect, X_train, X_test, y_train, y_test

#### Train traditional models

In [None]:

models, tfidf_vect, X_train, X_test, y_train, y_test = text_features(df_cleaned, test_size=0.2)

Naive Bayes model saved successfully at C:\Users\vrishin\Desktop\model_saves\naive_bayes_text_emotion_model.sav
SVM model saved successfully at C:\Users\vrishin\Desktop\model_saves\svm_text_emotion_model.sav
Logistic Regression model saved successfully at C:\Users\vrishin\Desktop\model_saves\logistic_regression_text_emotion_model.sav
Lasso Regression model saved successfully at C:\Users\vrishin\Desktop\model_saves\lasso_regression_text_emotion_model.sav
Decision Tree model saved successfully at C:\Users\vrishin\Desktop\model_saves\decision_tree_text_emotion_model.sav
Random Forest model saved successfully at C:\Users\vrishin\Desktop\model_saves\random_forest_text_emotion_model.sav
KNN model saved successfully at C:\Users\vrishin\Desktop\model_saves\knn_text_emotion_model.sav
XGBoost model saved successfully at C:\Users\vrishin\Desktop\model_saves\xgboost_text_emotion_model.sav
Vectorizer saved successfully.


#### Model Evaluation

In [None]:

results_list = []
for name, clf in models.items():
    y_pred = clf.predict(tfidf_vect.transform(X_test))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, y_pred, target_names=['Anger', 'Fear', 'Joy', 'Love', 'Sadness', 'Surprise']))
    
    # Store each model's metrics
    results_list.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

Classification Report for Naive Bayes:

              precision    recall  f1-score   support

       Anger       0.77      0.94      0.85     24504
        Fear       0.71      0.97      0.82     28247
         Joy       0.95      0.25      0.40      6853
        Love       0.93      0.65      0.76     11339
     Sadness       0.90      0.51      0.66      9376
    Surprise       1.00      0.09      0.16      3043

    accuracy                           0.77     83362
   macro avg       0.88      0.57      0.61     83362
weighted avg       0.81      0.77      0.74     83362

Classification Report for SVM:

              precision    recall  f1-score   support

       Anger       0.93      0.93      0.93     24504
        Fear       0.90      0.91      0.91     28247
         Joy       0.75      0.75      0.75      6853
        Love       0.89      0.89      0.89     11339
     Sadness       0.83      0.83      0.83      9376
    Surprise       0.73      0.70      0.71      3043

    a

#### Convert results to DataFrame and calculate a composite score (average of all metrics)

In [None]:

results_fullset = pd.DataFrame(results_list)
results_fullset['Composite Score'] = results_fullset[['Accuracy', 'Precision', 'Recall', 'F1 Score']].mean(axis=1)

#### Find the best model based on the highest composite score

In [None]:

best_model_name = results_fullset.loc[results_fullset['Composite Score'].idxmax(), 'Model']
best_model = models[best_model_name]

#### Print the best model details

In [None]:

print("\nOverall Performance Metrics for Each Model:")
print(results_fullset)
print(f"\nBest Model Based on Composite Score: {best_model_name}")


Overall Performance Metrics for Each Model:
                 Model  Accuracy  Precision    Recall  F1 Score  \
0          Naive Bayes  0.773518   0.812169  0.773518  0.743647   
1                  SVM  0.883484   0.883251  0.883484  0.883340   
2  Logistic Regression  0.890202   0.889324  0.890202  0.889574   
3     Lasso Regression  0.902150   0.901597  0.902150  0.901580   
4        Decision Tree  0.840731   0.840513  0.840731  0.840555   
5        Random Forest  0.862563   0.862292  0.862563  0.862398   
6                  KNN  0.475528   0.596713  0.475528  0.490754   
7              XGBoost  0.891797   0.897965  0.891797  0.893315   

   Composite Score  
0         0.775713  
1         0.883390  
2         0.889825  
3         0.901869  
4         0.840632  
5         0.862454  
6         0.509631  
7         0.893719  

Best Model Based on Composite Score: Lasso Regression


#### Plot using Plotly

In [None]:

fig = px.bar(results_fullset, x='Model', y=['Accuracy', 'Precision', 'Recall', 'F1 Score'], title="Model Performance Metrics")
fig.update_layout(barmode='group', xaxis_title="Model", yaxis_title="Score")
fig.show()

#### Confusion Matrix and Heatmap for the Best Model using Plotly

In [None]:

y_pred_best = best_model.predict(tfidf_vect.transform(X_test))
cm = confusion_matrix(y_test, y_pred_best)
cm_fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=['Anger', 'Fear', 'Joy', 'Love', 'Sadness', 'Surprise'],
    y=['Anger', 'Fear', 'Joy', 'Love', 'Sadness', 'Surprise'],
    colorscale='Blues'
))
cm_fig.update_layout(
    title=f'Confusion Matrix - {best_model_name}',
    xaxis_title="Predicted Label",
    yaxis_title="True Label"
)
cm_fig.show()

#### Save results to a CSV file

In [None]:

results_fullset.to_csv(os.path.join(save_dir, "results.csv"), index=False)

#### Cross-validation of Naive Bayes classification


In [15]:
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Perform Cross-Validation for a single model
def cross_validate_model(model, model_name, tfidf_vect, X, y, cv=5):
    """
    Perform cross-validation on a single model.
    
    Args:
    - model (sklearn estimator): The model to evaluate.
    - model_name (str): Name of the model.
    - tfidf_vect (TfidfVectorizer): Fitted TF-IDF vectorizer.
    - X (pd.Series): Input text data.
    - y (pd.Series): Target labels.
    - cv (int): Number of folds for cross-validation.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame with cross-validation metrics for the model.
    """
    X_tfidf = tfidf_vect.transform(X)
    print(f"Performing cross-validation for {model_name}...")
    
    # Calculate metrics
    accuracy_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='precision_weighted')
    recall_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='recall_weighted')
    f1_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='f1_weighted')
    
    # Prepare results
    result = {
        'Model': model_name,
        'Accuracy (CV)': np.mean(accuracy_scores),
        'Precision (CV)': np.mean(precision_scores),
        'Recall (CV)': np.mean(recall_scores),
        'F1 Score (CV)': np.mean(f1_scores),
        'Composite Score (CV)': np.mean([np.mean(accuracy_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)])
    }
    
    result_df = pd.DataFrame([result])
    return result_df


model_name = 'Naive Bayes'  
selected_model = models[model_name]  

# Perform cross-validation for the selected model
cv_result = cross_validate_model(selected_model, model_name, tfidf_vect, df_cleaned['cleaned_text'], df_cleaned['Sentiment_label'], cv=5)

# Display and save the result
print(cv_result)

# Save the cross-validation result to CSV
cv_result_file_path = os.path.join(save_dir, f"{model_name}_cross_validation_results.csv")
cv_result.to_csv(cv_result_file_path, index=False)
print(f"\nCross-validation results for {model_name} saved to {cv_result_file_path}.")


Performing cross-validation for Naive Bayes...
         Model  Accuracy (CV)  Precision (CV)  Recall (CV)  F1 Score (CV)  \
0  Naive Bayes       0.772289        0.810524     0.772289       0.742095   

   Composite Score (CV)  
0              0.774299  

Cross-validation results for Naive Bayes saved to C:\Users\vrishin\Desktop\model_saves\Naive Bayes_cross_validation_results.csv.


#### Cross Validation for SVM classification

In [18]:
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Perform Cross-Validation for a single model
def cross_validate_model(model, model_name, tfidf_vect, X, y, cv=5):
    """
    Perform cross-validation on a single model.
    
    Args:
    - model (sklearn estimator): The model to evaluate.
    - model_name (str): Name of the model.
    - tfidf_vect (TfidfVectorizer): Fitted TF-IDF vectorizer.
    - X (pd.Series): Input text data.
    - y (pd.Series): Target labels.
    - cv (int): Number of folds for cross-validation.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame with cross-validation metrics for the model.
    """
    X_tfidf = tfidf_vect.transform(X)
    print(f"Performing cross-validation for {model_name}...")
    
    # Calculate metrics
    accuracy_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='precision_weighted')
    recall_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='recall_weighted')
    f1_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='f1_weighted')
    
    # Prepare results
    result = {
        'Model': model_name,
        'Accuracy (CV)': np.mean(accuracy_scores),
        'Precision (CV)': np.mean(precision_scores),
        'Recall (CV)': np.mean(recall_scores),
        'F1 Score (CV)': np.mean(f1_scores),
        'Composite Score (CV)': np.mean([np.mean(accuracy_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)])
    }
    
    result_df = pd.DataFrame([result])
    return result_df


model_name = 'SVM'  
selected_model = models[model_name]  

# Perform cross-validation for the selected model
cv_result = cross_validate_model(selected_model, model_name, tfidf_vect, df_cleaned['cleaned_text'], df_cleaned['Sentiment_label'], cv=5)

# Display and save the result
print(cv_result)

# Save the cross-validation result to CSV
cv_result_file_path = os.path.join(save_dir, f"{model_name}_cross_validation_results.csv")
cv_result.to_csv(cv_result_file_path, index=False)
print(f"\nCross-validation results for {model_name} saved to {cv_result_file_path}.")


Performing cross-validation for SVM...
  Model  Accuracy (CV)  Precision (CV)  Recall (CV)  F1 Score (CV)  \
0   SVM       0.883028        0.882887     0.883028       0.882928   

   Composite Score (CV)  
0              0.882968  

Cross-validation results for SVM saved to C:\Users\vrishin\Desktop\model_saves\SVM_cross_validation_results.csv.


#### Cross Validation for Logistic Regression Classifier

In [19]:
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Perform Cross-Validation for a single model
def cross_validate_model(model, model_name, tfidf_vect, X, y, cv=5):
    """
    Perform cross-validation on a single model.
    
    Args:
    - model (sklearn estimator): The model to evaluate.
    - model_name (str): Name of the model.
    - tfidf_vect (TfidfVectorizer): Fitted TF-IDF vectorizer.
    - X (pd.Series): Input text data.
    - y (pd.Series): Target labels.
    - cv (int): Number of folds for cross-validation.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame with cross-validation metrics for the model.
    """
    X_tfidf = tfidf_vect.transform(X)
    print(f"Performing cross-validation for {model_name}...")
    
    # Calculate metrics
    accuracy_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='precision_weighted')
    recall_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='recall_weighted')
    f1_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='f1_weighted')
    
    # Prepare results
    result = {
        'Model': model_name,
        'Accuracy (CV)': np.mean(accuracy_scores),
        'Precision (CV)': np.mean(precision_scores),
        'Recall (CV)': np.mean(recall_scores),
        'F1 Score (CV)': np.mean(f1_scores),
        'Composite Score (CV)': np.mean([np.mean(accuracy_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)])
    }
    
    result_df = pd.DataFrame([result])
    return result_df


model_name = 'Logistic Regression'  
selected_model = models[model_name]  

# Perform cross-validation for the selected model
cv_result = cross_validate_model(selected_model, model_name, tfidf_vect, df_cleaned['cleaned_text'], df_cleaned['Sentiment_label'], cv=5)

# Display and save the result
print(cv_result)

# Save the cross-validation result to CSV
cv_result_file_path = os.path.join(save_dir, f"{model_name}_cross_validation_results.csv")
cv_result.to_csv(cv_result_file_path, index=False)
print(f"\nCross-validation results for {model_name} saved to {cv_result_file_path}.")


Performing cross-validation for Logistic Regression...
                 Model  Accuracy (CV)  Precision (CV)  Recall (CV)  \
0  Logistic Regression       0.891617        0.890707     0.891617   

   F1 Score (CV)  Composite Score (CV)  
0       0.890953              0.891224  

Cross-validation results for Logistic Regression saved to C:\Users\vrishin\Desktop\model_saves\Logistic Regression_cross_validation_results.csv.


#### Cross Validation for Lasso Regression Classifier

In [20]:
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Perform Cross-Validation for a single model
def cross_validate_model(model, model_name, tfidf_vect, X, y, cv=5):
    """
    Perform cross-validation on a single model.
    
    Args:
    - model (sklearn estimator): The model to evaluate.
    - model_name (str): Name of the model.
    - tfidf_vect (TfidfVectorizer): Fitted TF-IDF vectorizer.
    - X (pd.Series): Input text data.
    - y (pd.Series): Target labels.
    - cv (int): Number of folds for cross-validation.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame with cross-validation metrics for the model.
    """
    X_tfidf = tfidf_vect.transform(X)
    print(f"Performing cross-validation for {model_name}...")
    
    # Calculate metrics
    accuracy_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='precision_weighted')
    recall_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='recall_weighted')
    f1_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='f1_weighted')
    
    # Prepare results
    result = {
        'Model': model_name,
        'Accuracy (CV)': np.mean(accuracy_scores),
        'Precision (CV)': np.mean(precision_scores),
        'Recall (CV)': np.mean(recall_scores),
        'F1 Score (CV)': np.mean(f1_scores),
        'Composite Score (CV)': np.mean([np.mean(accuracy_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)])
    }
    
    result_df = pd.DataFrame([result])
    return result_df


model_name = 'Lasso Regression'  
selected_model = models[model_name]  

# Perform cross-validation for the selected model
cv_result = cross_validate_model(selected_model, model_name, tfidf_vect, df_cleaned['cleaned_text'], df_cleaned['Sentiment_label'], cv=5)

# Display and save the result
print(cv_result)

# Save the cross-validation result to CSV
cv_result_file_path = os.path.join(save_dir, f"{model_name}_cross_validation_results.csv")
cv_result.to_csv(cv_result_file_path, index=False)
print(f"\nCross-validation results for {model_name} saved to {cv_result_file_path}.")


Performing cross-validation for Lasso Regression...
              Model  Accuracy (CV)  Precision (CV)  Recall (CV)  \
0  Lasso Regression       0.902706        0.902156     0.902706   

   F1 Score (CV)  Composite Score (CV)  
0       0.902162              0.902433  

Cross-validation results for Lasso Regression saved to C:\Users\vrishin\Desktop\model_saves\Lasso Regression_cross_validation_results.csv.


#### Cross Validation for Decision Tree Classifier

In [21]:
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Perform Cross-Validation for a single model
def cross_validate_model(model, model_name, tfidf_vect, X, y, cv=5):
    """
    Perform cross-validation on a single model.
    
    Args:
    - model (sklearn estimator): The model to evaluate.
    - model_name (str): Name of the model.
    - tfidf_vect (TfidfVectorizer): Fitted TF-IDF vectorizer.
    - X (pd.Series): Input text data.
    - y (pd.Series): Target labels.
    - cv (int): Number of folds for cross-validation.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame with cross-validation metrics for the model.
    """
    X_tfidf = tfidf_vect.transform(X)
    print(f"Performing cross-validation for {model_name}...")
    
    # Calculate metrics
    accuracy_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='precision_weighted')
    recall_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='recall_weighted')
    f1_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='f1_weighted')
    
    # Prepare results
    result = {
        'Model': model_name,
        'Accuracy (CV)': np.mean(accuracy_scores),
        'Precision (CV)': np.mean(precision_scores),
        'Recall (CV)': np.mean(recall_scores),
        'F1 Score (CV)': np.mean(f1_scores),
        'Composite Score (CV)': np.mean([np.mean(accuracy_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)])
    }
    
    result_df = pd.DataFrame([result])
    return result_df


model_name = 'Decision Tree'  
selected_model = models[model_name]  

# Perform cross-validation for the selected model
cv_result = cross_validate_model(selected_model, model_name, tfidf_vect, df_cleaned['cleaned_text'], df_cleaned['Sentiment_label'], cv=5)

# Display and save the result
print(cv_result)

# Save the cross-validation result to CSV
cv_result_file_path = os.path.join(save_dir, f"{model_name}_cross_validation_results.csv")
cv_result.to_csv(cv_result_file_path, index=False)
print(f"\nCross-validation results for {model_name} saved to {cv_result_file_path}.")


Performing cross-validation for Decision Tree...
           Model  Accuracy (CV)  Precision (CV)  Recall (CV)  F1 Score (CV)  \
0  Decision Tree        0.83961        0.839309     0.839622       0.839435   

   Composite Score (CV)  
0              0.839494  

Cross-validation results for Decision Tree saved to C:\Users\vrishin\Desktop\model_saves\Decision Tree_cross_validation_results.csv.
