# building a Logistic machine model using the twitter sentiments datasets

In [29]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
df = pd.read_csv(r"C:\Users\tumel\OneDrive\Desktop\Twitter US Airline Sentiment\Tweets.csv")

# Features (X) and Target (y)
X = df['text']   # tweets
y = df['airline_sentiment']   # sentiment labels

# Split data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Predictions
y_pred = model.predict(X_test_vec)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7824453551912568

Classification Report:
               precision    recall  f1-score   support

    negative       0.81      0.94      0.87      1835
     neutral       0.64      0.48      0.55       620
    positive       0.82      0.58      0.68       473

    accuracy                           0.78      2928
   macro avg       0.76      0.67      0.70      2928
weighted avg       0.77      0.78      0.77      2928



# building Linear Model

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Load dataset
df = pd.read_csv(r"C:\Users\tumel\OneDrive\Desktop\Twitter US Airline Sentiment\Tweets.csv")

# Feature engineering
df['tweet_length'] = df['text'].apply(len)
df['num_hashtags'] = df['text'].str.count('#')
df['num_mentions'] = df['text'].str.count('@')

# Input (X) and Output (y)
X_text = df['text']
y = df['airline_sentiment']

# Train-test split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec = vectorizer.transform(X_test_text)

# Add numeric features
X_train_num = df.loc[X_train_text.index, ['tweet_length', 'num_hashtags', 'num_mentions']].values
X_test_num = df.loc[X_test_text.index, ['tweet_length', 'num_hashtags', 'num_mentions']].values

# Normalize numeric features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)

# Combine text + numeric features
X_train_combined = hstack([X_train_vec, X_train_num])
X_test_combined = hstack([X_test_vec, X_test_num])

# Train Linear Support Vector Classifier
model = LinearSVC()
model.fit(X_train_combined, y_train)

# Predictions
y_pred = model.predict(X_test_combined)
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7752732240437158
Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.89      0.86      1835
     neutral       0.59      0.55      0.57       620
    positive       0.75      0.62      0.68       473

    accuracy                           0.78      2928
   macro avg       0.73      0.69      0.70      2928
weighted avg       0.77      0.78      0.77      2928



# building a KNN using the twitter sentiments dataset

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Load dataset
df = pd.read_csv(r"C:\Users\tumel\OneDrive\Desktop\Twitter US Airline Sentiment\Tweets.csv")

# Feature engineering
df['tweet_length'] = df['text'].apply(len)
df['num_hashtags'] = df['text'].str.count('#')
df['num_mentions'] = df['text'].str.count('@')

# X = tweet text + meta features
X_text = df['text']
y = df['airline_sentiment']

# Train-test split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec = vectorizer.transform(X_test_text)

# Numeric features
X_train_num = df.loc[X_train_text.index, ['tweet_length','num_hashtags','num_mentions']].values
X_test_num = df.loc[X_test_text.index, ['tweet_length','num_hashtags','num_mentions']].values

# Scale numeric features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)

# Combine TF-IDF + numeric features
X_train_combined = hstack([X_train_vec, X_train_num])
X_test_combined = hstack([X_test_vec, X_test_num])

# Train KNN model
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_train_combined, y_train)

# Predictions
y_pred = knn.predict(X_test_combined)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7103825136612022

Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.89      0.82      1835
     neutral       0.55      0.41      0.47       620
    positive       0.57      0.41      0.48       473

    accuracy                           0.71      2928
   macro avg       0.63      0.57      0.59      2928
weighted avg       0.69      0.71      0.69      2928



#RandomForestRegression Model

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Load dataset
df = pd.read_csv(r"C:\Users\tumel\OneDrive\Desktop\Twitter US Airline Sentiment\Tweets.csv")

# Feature engineering
df['tweet_length'] = df['text'].apply(len)
df['num_hashtags'] = df['text'].str.count('#')
df['num_mentions'] = df['text'].str.count('@')

# Define features (X) and target (y)
X_text = df['text']
y = df['airline_sentiment']

# Split dataset
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec = vectorizer.transform(X_test_text)

# Numeric features
X_train_num = df.loc[X_train_text.index, ['tweet_length','num_hashtags','num_mentions']].values
X_test_num = df.loc[X_test_text.index, ['tweet_length','num_hashtags','num_mentions']].values

# Scale numeric features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)

# Combine text + numeric features
X_train_combined = hstack([X_train_vec, X_train_num])
X_test_combined = hstack([X_test_vec, X_test_num])

# Train Random Forest
rf = RandomForestClassifier(
    n_estimators=200,       # number of trees
    max_depth=20,          # limit tree depth
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_combined, y_train)

# Predictions
y_pred = rf.predict(X_test_combined)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6598360655737705

Classification Report:
               precision    recall  f1-score   support

    negative       0.65      1.00      0.79      1835
     neutral       0.71      0.06      0.11       620
    positive       0.88      0.14      0.25       473

    accuracy                           0.66      2928
   macro avg       0.75      0.40      0.38      2928
weighted avg       0.70      0.66      0.56      2928

