In [8]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [9]:
# model_development.py

import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Create a directory to save models and vectorizers (if it doesn't already exist)
os.makedirs("models", exist_ok=True)



In [10]:
# Define configurations for each cleaned dataset
datasets_config = {

    "IMDB-Dataset": {
        "path": "/content/drive/MyDrive/Project2/clean_data/IMDB-Dataset_cleaned.csv",
        "text_column": "clean_review",  # Assuming you saved the cleaned review column as 'clean_review'
        "target": "sentiment"
    },

    "socialmedia_sentiment": {
        "path": "/content/drive/MyDrive/Project2/clean_data/socialmedia_sentiment_cleaned.csv",
        "text_column": "Text",
        "target": "Sentiment"
    },
    "twitter_us_airline": {
        "path": "/content/drive/MyDrive/Project2/clean_data/twitter_us_airline_cleaned.csv",
        "text_column": "text",
        "target": "airline_sentiment"
    },
    "twitter_training": {
        "path": "/content/drive/MyDrive/Project2/clean_data/twitter_training_cleaned.csv",
        "text_column": "Text",
        "target": "Sentiment"
    },
    "consumer_complaints": {
        "path": "/content/drive/MyDrive/Project2/clean_data/consumer_complaints_cleaned.csv",
        "text_column": "consumer_complaint_narrative",  # Assuming this column exists after cleaning
        "target": "company_response_to_consumer"         # Adjust based on your labeling
    }
}

import pandas as pd

# Loop through each dataset configuration
for dataset_name, config in datasets_config.items():
    # Load the dataset using the path specified in the configuration
    df = pd.read_csv(config["path"])

    # Print the dataset name and the first few rows
    print(f"\nDataset: {dataset_name}")
    print(df.head())



Dataset: IMDB-Dataset
                                              review sentiment  \
0  I really liked this Summerslam due to the look...  positive   
1  Not many television shows appeal to quite as m...  positive   
2  The film quickly gets to a major chase scene w...  negative   
3  Jane Austen would definitely approve of this o...  positive   
4  Expectations were somewhat high for me when I ...  negative   

                                        clean_review  
0  i really liked this summerslam due to the look...  
1  not many television shows appeal to quite as m...  
2  the film quickly gets to a major chase scene w...  
3  jane austen would definitely approve of this o...  
4  expectations were somewhat high for me when i ...  

Dataset: socialmedia_sentiment
   Unnamed: 0.1                                         Text    Sentiment  \
0             0        enjoying a beautiful day at the park!   Positive     
1             1           traffic was terrible this morning.   N

In [12]:
import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Create a directory in Google Drive to save the models if it doesn't already exist
models_dir = "/content/drive/MyDrive/Project2/models"
os.makedirs(models_dir, exist_ok=True)

# Iterate over each dataset configuration
for dataset_name, config in datasets_config.items():
    print(f"\n\n=== Processing {dataset_name} ===")

    # Load the cleaned dataset
    df = pd.read_csv(config["path"])
    print(f"Loaded {dataset_name} with shape: {df.shape}")

    # Drop rows with missing text or target values
    df = df.dropna(subset=[config["text_column"], config["target"]])
    print(f"After dropping missing values, shape: {df.shape}")

    # Extract features (X) and labels (y)
    X = df[config["text_column"]]
    y = df[config["target"]]

    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature extraction using TF-IDF
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train a Logistic Regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_tfidf, y_train)

    # Predict and evaluate the model
    y_pred = model.predict(X_test_tfidf)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the trained model and vectorizer using pickle into the models directory
    model_filename = os.path.join(models_dir, f"{dataset_name}_model.pkl")
    vectorizer_filename = os.path.join(models_dir, f"{dataset_name}_vectorizer.pkl")

    with open(model_filename, "wb") as f:
        pickle.dump(model, f)
    with open(vectorizer_filename, "wb") as f:
        pickle.dump(vectorizer, f)

    print(f"Saved model to {model_filename} and vectorizer to {vectorizer_filename}")

print("\nModel development for all datasets completed.")




=== Processing IMDB-Dataset ===
Loaded IMDB-Dataset with shape: (25000, 3)
After dropping missing values, shape: (25000, 3)
Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.86      0.87      2475
    positive       0.87      0.89      0.88      2525

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

Saved model to /content/drive/MyDrive/Project2/models/IMDB-Dataset_model.pkl and vectorizer to /content/drive/MyDrive/Project2/models/IMDB-Dataset_vectorizer.pkl


=== Processing socialmedia_sentiment ===
Loaded socialmedia_sentiment with shape: (732, 14)
After dropping missing values, shape: (732, 14)
Classification Report:
                        precision    recall  f1-score   support

         Acceptance          0.00      0.00      0.00         2
      Acceptance             0.00      0.00      0.00         0
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.93      0.88      1889
     neutral       0.67      0.53      0.59       580
    positive       0.80      0.63      0.70       459

    accuracy                           0.80      2928
   macro avg       0.77      0.70      0.72      2928
weighted avg       0.80      0.80      0.80      2928

Saved model to /content/drive/MyDrive/Project2/models/twitter_us_airline_model.pkl and vectorizer to /content/drive/MyDrive/Project2/models/twitter_us_airline_vectorizer.pkl


=== Processing twitter_training ===
Loaded twitter_training with shape: (74682, 4)
After dropping missing values, shape: (73824, 4)
Classification Report:
              precision    recall  f1-score   support

  irrelevant       0.66      0.53      0.59      2561
    negative       0.74      0.79      0.76      4504
     neutral       0.68      0.64      0.66      3582
    positive       0.68      0.74      0.71     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
