<a href="https://colab.research.google.com/github/SachinduDilshan/AirlineRatingPredictor/blob/main/Airline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK resources used below: the stopword list and the
# tokenizer models needed by nltk.word_tokenize.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Step 1: Load the Dataset
# The CSV is expected at 'sample_data/Tweets.csv'; only the label column
# and the raw tweet text are kept.
df = pd.read_csv('sample_data/Tweets.csv')[["airline_sentiment", "text"]]

# Step 2: Preprocess Text
# Shared stemmer instance used by clean_text.
ps = PorterStemmer()

# Compiled once: matches http/https/www-prefixed runs of non-whitespace.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)

# Lazily-built frozenset of English stopwords (filled on first use so the
# corpus is only read after nltk.download has run).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stopword list as a frozenset, loading it once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps: lowercase, strip URLs, tokenize with ``nltk.word_tokenize``,
    drop English stopwords and punctuation tokens, then Porter-stem what
    remains.

    Parameters:
    text (str): Raw tweet text.

    Returns:
    str: The stemmed tokens joined by single spaces (may be empty).
    """
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = _URL_PATTERN.sub('', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # The stopword list was previously re-read from the corpus for every
    # token; look it up once per call and use O(1) set membership instead.
    stop_words = _english_stop_words()
    # NOTE: the punctuation check is a substring test against
    # string.punctuation (kept as-is so the cleaned output is unchanged).
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token not in stop_words and token not in string.punctuation
    )

def _print_results(header, y_true, y_pred):
    """Print the accuracy and the classification report under *header*."""
    print(header)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


# Normalise every tweet with clean_text
print("Cleaning text data...")
df['text_cleaned'] = df['text'].apply(clean_text)

# Step 3: Feature Extraction
print("Extracting features...")
# TF-IDF vocabulary capped at 3000 terms; the matrix is densified
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['text_cleaned']).toarray()
Y = df['airline_sentiment'].values

# Step 4: Train Models
print("Splitting dataset...")
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2
)

# Multinomial Naive Bayes: fit, predict on the held-out set, report
print("\nTraining Multinomial Naive Bayes...")
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)
_print_results("\nNaive Bayes Results:", y_test, nb_predictions)

# Random Forest: fit, predict on the held-out set, report
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
_print_results("\nRandom Forest Results:", y_test, rf_predictions)

# Function to predict sentiment for new tweets
def predict_sentiment(new_text, model=rf_model):
    """
    Classify the sentiment of one tweet or of several tweets.

    Each tweet is passed through clean_text and the already-fitted
    TF-IDF vectorizer before being handed to the model.

    Parameters:
    new_text (str or list): A single tweet, or an iterable of tweets
    model: Fitted classifier exposing predict() (default: Random Forest)

    Returns:
    A single predicted label when new_text is a str; otherwise the
    array of labels returned by model.predict, one per tweet.
    """
    is_single = isinstance(new_text, str)
    # Treat a lone string as a one-element batch so both cases share a path
    tweets = [new_text] if is_single else new_text
    cleaned = [clean_text(tweet) for tweet in tweets]
    # Transform using the same vectorizer that was fitted on the training data
    features = tfidf.transform(cleaned).toarray()
    labels = model.predict(features)
    if is_single:
        return labels[0]
    return labels

# Demonstrate predict_sentiment on a few hand-written tweets
example_tweets = [
    "This airline is terrible, worst experience ever!",
    "Great service and comfortable flight, thank you!",
    "Flight was okay, nothing special",
]

print("\nExample Predictions:")
predictions = predict_sentiment(example_tweets)
# One prediction is returned per input tweet, in the same order
for idx, tweet in enumerate(example_tweets):
    pred = predictions[idx]
    print(f"Tweet: {tweet}")
    print(f"Predicted Sentiment: {pred}\n")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Cleaning text data...
Extracting features...
Splitting dataset...

Training Multinomial Naive Bayes...

Naive Bayes Results:
Accuracy: 0.7226775956284153

Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.98      0.83      1791
     neutral       0.72      0.29      0.41       648
    positive       0.80      0.36      0.50       489

    accuracy                           0.72      2928
   macro avg       0.75      0.54      0.58      2928
weighted avg       0.73      0.72      0.68      2928


Training Random Forest...

Random Forest Results:
Accuracy: 0.7459016393442623

Classification Report:
              precision    recall  f1-score   support

    negative       0.77      0.93      0.85      1791
     neutral       0.62      0.41      0.49       648
    positive       0.72      0.50      0.59       489

    accuracy                           0.75      2928
   macro avg       0.70      0.62      0.64      2928
weighted av