In [5]:
import pandas as pd

df = pd.read_csv("tweets.csv")

print("Shape:", df.shape)
print("\nColumns:", df.columns)

df = df[['text', 'airline_sentiment']]

df = df.dropna(subset=['text'])

# Create binary escalation risk label
df['high_risk'] = df['airline_sentiment'].apply(
    lambda x: 1 if x == 'negative' else 0
)

print("\nHigh Risk Distribution:")
print(df['high_risk'].value_counts())

Shape: (14640, 15)

Columns: Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='str')

High Risk Distribution:
high_risk
1    9178
0    5462
Name: count, dtype: int64


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import re
import string
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    return " ".join(words)

# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)

print(df[['text', 'clean_text']].head())

                                                text  \
0                @VirginAmerica What @dhepburn said.   
1  @VirginAmerica plus you've added commercials t...   
2  @VirginAmerica I didn't today... Must mean I n...   
3  @VirginAmerica it's really aggressive to blast...   
4  @VirginAmerica and it's a really big bad thing...   

                                          clean_text  
0                                               said  
1      plus youve added commercials experience tacky  
2       didnt today must mean need take another trip  
3  really aggressive blast obnoxious entertainmen...  
4                               really big bad thing  


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['high_risk']

vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)

Training shape: (11712, 5000)
Testing shape: (2928, 5000)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluation
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.83      0.69      0.75      1092
           1       0.83      0.91      0.87      1836

    accuracy                           0.83      2928
   macro avg       0.83      0.80      0.81      2928
weighted avg       0.83      0.83      0.83      2928

ROC-AUC Score: 0.9055145562498503


In [12]:
from sklearn.metrics import precision_recall_curve
import numpy as np

precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

# thresholds is 1 element shorter
thresholds = np.append(thresholds, 1.0)

for p, r, t in zip(precisions, recalls, thresholds):
    if r >= 0.95:
        print("Threshold:", round(t, 3))
        print("Precision:", round(p, 3))
        print("Recall:", round(r, 3))
        break

Threshold: 0.001
Precision: 0.627
Recall: 1.0


In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

def calculate_cost(threshold):
    y_pred_custom = (y_prob >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_custom).ravel()
    
    cost = (fn * 5000) + (fp * 500)
    return cost, fn, fp

thresholds_to_test = np.linspace(0.1, 0.9, 9)

results = []

for t in thresholds_to_test:
    cost, fn, fp = calculate_cost(t)
    results.append((t, cost, fn, fp))

cost_df = pd.DataFrame(results, columns=['Threshold', 'Total_Cost', 'False_Negatives', 'False_Positives'])

cost_df.sort_values("Total_Cost")

Unnamed: 0,Threshold,Total_Cost,False_Negatives,False_Positives
1,0.2,465500,10,831
0,0.1,489000,1,968
2,0.3,492000,30,684
3,0.4,626500,76,493
4,0.5,964000,159,338
5,0.6,1528500,285,207
6,0.7,2484000,485,118
7,0.8,3971500,789,53
8,0.9,6409500,1281,9
