In [1]:
!pip -q install pandas numpy scikit-learn nltk matplotlib

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression


In [2]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("vader_lexicon")

from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [3]:
df = pd.read_excel("2024 Comment Report.xlsx")
df.head()


Unnamed: 0,Ship Class,COMMENTS
0,Breakaway Class,Overall cruise expetience was Good. Food in th...
1,Breakaway Class,The ship is showing its age and should have be...
2,Breakaway Class,"did not like that therwas no general seating ,"
3,Breakaway Class,My balcony ststeroom was gross. It smelled dus...
4,Breakaway Class,Because I had a smell in my room. And all the ...


In [4]:
df.columns


Index(['Ship Class', 'COMMENTS'], dtype='object')

In [6]:
import pandas as pd

df = df.copy()

df = df.rename(columns={
    "Ship Class": "class",
    "COMMENTS": "comment_text"
})

df[["class", "comment_text"]].head()


Unnamed: 0,class,comment_text
0,Breakaway Class,Overall cruise expetience was Good. Food in th...
1,Breakaway Class,The ship is showing its age and should have be...
2,Breakaway Class,"did not like that therwas no general seating ,"
3,Breakaway Class,My balcony ststeroom was gross. It smelled dus...
4,Breakaway Class,Because I had a smell in my room. And all the ...


In [7]:
# Make sure text is string
df["comment_text"] = df["comment_text"].astype(str)

# Remove empty / whitespace-only comments
df = df[df["comment_text"].str.strip().ne("")]

# Drop duplicate comments
df = df.drop_duplicates(subset=["comment_text"])

print("Rows after cleaning:", len(df))
df.head()


Rows after cleaning: 411694


Unnamed: 0,class,comment_text
0,Breakaway Class,Overall cruise expetience was Good. Food in th...
1,Breakaway Class,The ship is showing its age and should have be...
2,Breakaway Class,"did not like that therwas no general seating ,"
3,Breakaway Class,My balcony ststeroom was gross. It smelled dus...
4,Breakaway Class,Because I had a smell in my room. And all the ...


In [8]:
import re

def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"http\S+|www\.\S+", " ", s)      # remove URLs
    s = re.sub(r"[^a-z0-9\s']", " ", s)         # keep letters/numbers/apostrophes
    s = re.sub(r"\s+", " ", s).strip()          # normalize spaces
    return s

df["clean_text"] = df["comment_text"].apply(clean_text)

df[["comment_text", "clean_text"]].head(5)


Unnamed: 0,comment_text,clean_text
0,Overall cruise expetience was Good. Food in th...,overall cruise expetience was good food in the...
1,The ship is showing its age and should have be...,the ship is showing its age and should have be...
2,"did not like that therwas no general seating ,",did not like that therwas no general seating
3,My balcony ststeroom was gross. It smelled dus...,my balcony ststeroom was gross it smelled dust...
4,Because I had a smell in my room. And all the ...,because i had a smell in my room and all the t...


In [10]:
import nltk
nltk.download("vader_lexicon")

from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

df["sentiment"] = df["clean_text"].apply(lambda x: sia.polarity_scores(x)["compound"])

df[["clean_text", "sentiment"]].head(5)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


KeyboardInterrupt: 

In [11]:
URGENT_KEYWORDS = [
    "refund","compensation","charged","billing","cancel","unsafe","safety","danger",
    "medical","hospital","sick","food poisoning","allergy",
    "manager","supervisor","complaint","report","harassment","assault","abuse"
]

def urgent_hits(text):
    return sum(1 for w in URGENT_KEYWORDS if w in text)

df["urgent_hits"] = df["clean_text"].apply(urgent_hits)

df[["clean_text", "urgent_hits"]].head(5)


Unnamed: 0,clean_text,urgent_hits
0,overall cruise expetience was good food in the...,0
1,the ship is showing its age and should have be...,0
2,did not like that therwas no general seating,0
3,my balcony ststeroom was gross it smelled dust...,1
4,because i had a smell in my room and all the t...,1


In [12]:
INTENSIFIERS = [
    "worst","terrible","horrible","awful","disgusting",
    "unacceptable","never","ruined","hate"
]

def intensity_hits(text):
    return sum(1 for w in INTENSIFIERS if w in text)

df["intense_hits"] = df["clean_text"].apply(intensity_hits)

df[["clean_text", "intense_hits"]].head(5)


Unnamed: 0,clean_text,intense_hits
0,overall cruise expetience was good food in the...,0
1,the ship is showing its age and should have be...,2
2,did not like that therwas no general seating,0
3,my balcony ststeroom was gross it smelled dust...,1
4,because i had a smell in my room and all the t...,0


In [13]:
df["word_count"] = df["clean_text"].apply(lambda x: len(x.split()))
df["has_number"] = df["clean_text"].str.contains(r"\d").astype(int)

df[["clean_text", "word_count", "has_number"]].head(5)


Unnamed: 0,clean_text,word_count,has_number
0,overall cruise expetience was good food in the...,43,0
1,the ship is showing its age and should have be...,159,0
2,did not like that therwas no general seating,8,0
3,my balcony ststeroom was gross it smelled dust...,270,1
4,because i had a smell in my room and all the t...,50,0


In [14]:
df.columns


Index(['class', 'comment_text', 'clean_text', 'urgent_hits', 'intense_hits',
       'word_count', 'has_number'],
      dtype='object')

In [16]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# 1) Sentiment in one pass (faster)
sent = df["clean_text"].apply(sia.polarity_scores)
sent_df = pd.DataFrame(list(sent))  # columns: neg, neu, pos, compound
sent_df = sent_df.rename(columns={
    "compound": "sent_compound",
    "neg": "sent_neg",
    "pos": "sent_pos",
    "neu": "sent_neu"
})

In [17]:
# 2) Other features (lightweight)
df["word_count"] = df["clean_text"].str.split().str.len()
df["exclamations"] = df["comment_text"].str.count("!")
df["has_number"] = df["comment_text"].str.contains(r"\d", regex=True).astype(int)
df["negation_hits"] = df["clean_text"].str.count(r"\b(not|never|no)\b")

# 3) Keyword hits (fast enough; small sets)
df["urgent_hits"] = df["clean_text"].apply(lambda t: sum(k in t for k in URGENT_KEYWORDS))
df["intense_hits"] = df["clean_text"].apply(lambda t: sum(k in t for k in INTENSIFIERS))

# 4) Merge sentiment back
df = pd.concat([df.reset_index(drop=True), sent_df.reset_index(drop=True)], axis=1)

df[["sent_compound","urgent_hits","intense_hits","word_count","has_number"]].head()

Unnamed: 0,sent_compound,urgent_hits,intense_hits,word_count,has_number
0,0.7709,0,0,43,0
1,-0.9007,0,2,159,0
2,-0.5122,0,0,8,0
3,0.9724,1,1,270,1
4,-0.6765,1,0,50,0


In [18]:
df.columns


Index(['class', 'comment_text', 'clean_text', 'urgent_hits', 'intense_hits',
       'word_count', 'has_number', 'exclamations', 'negation_hits', 'sent_neg',
       'sent_neu', 'sent_pos', 'sent_compound'],
      dtype='object')

In [19]:
def compute_severity(row):
    # 1) Negative sentiment intensity (0..1) -> up to 45 points
    neg_intensity = max(0, -row["sent_compound"])     # 0 to 1
    score = 45 * neg_intensity

    # 2) Urgency / escalation cues -> up to 30 points
    score += 10 * min(row["urgent_hits"], 3)          # cap at 3 => +30

    # 3) Strong negative language -> up to 15 points
    score += 5 * min(row["intense_hits"], 3)          # cap at 3 => +15

    # 4) Specificity (more detailed complaints are often more actionable) -> up to 10 points
    score += 0.05 * min(row["word_count"], 200)       # cap words => +10

    # Bound 0..100
    return float(max(0, min(100, score)))

df["severity_score"] = df.apply(compute_severity, axis=1)
df["severity_score"].describe()


Unnamed: 0,severity_score
count,411694.0
mean,13.695666
std,16.90728
min,0.0
25%,1.6
50%,5.4
75%,20.0535
max,99.8965


In [20]:
df["severity_band"] = pd.cut(
    df["severity_score"],
    bins=[-1, 25, 50, 75, 100],
    labels=["Low", "Medium", "High", "Critical"]
)

df["severity_band"].value_counts(normalize=True).mul(100).round(2)


Unnamed: 0_level_0,proportion
severity_band,Unnamed: 1_level_1
Low,79.25
Medium,15.68
High,4.62
Critical,0.45


In [22]:
df["escalation_risk"] = df["severity_band"].isin(["High", "Critical"]).astype(int)

df["escalation_risk"].value_counts(normalize=True).mul(100).round(2)


Unnamed: 0_level_0,proportion
escalation_risk,Unnamed: 1_level_1
0,94.93
1,5.07


In [23]:
from sklearn.model_selection import train_test_split

FEATURES = [
    "sent_compound","sent_neg","sent_pos",
    "urgent_hits","intense_hits",
    "word_count","has_number","exclamations","negation_hits"
]

X = df[FEATURES]
y = df["escalation_risk"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42
)

len(X_train), len(X_test)


(308770, 102924)

In [24]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=2000, class_weight="balanced")
model.fit(X_train, y_train)

proba = model.predict_proba(X_test)[:,1]
pred = (proba >= 0.5).astype(int)


In [25]:
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(y_test, pred))
print("ROC-AUC:", roc_auc_score(y_test, proba))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99     97707
           1       0.83      1.00      0.91      5217

    accuracy                           0.99    102924
   macro avg       0.92      0.99      0.95    102924
weighted avg       0.99      0.99      0.99    102924

ROC-AUC: 0.9993476386319601


In [26]:
import pandas as pd

coef = pd.Series(
    model.coef_[0],
    index=FEATURES
).sort_values(ascending=False)

coef


Unnamed: 0,0
urgent_hits,9.415231
intense_hits,4.915118
sent_neg,3.254186
has_number,0.093551
word_count,0.044107
negation_hits,0.005244
exclamations,0.000483
sent_pos,-4.441444
sent_compound,-38.25263


In [27]:
# Top 20 highest escalation-risk comments
top_risk = df.sort_values("severity_score", ascending=False).head(20)

top_risk_view = top_risk[
    ["class","severity_score","urgent_hits","intense_hits","sent_compound","word_count","comment_text"]
].copy()

# Sanitize comment display
top_risk_view["comment_text"] = top_risk_view["comment_text"].str.slice(0, 200) + "..."

top_risk_view


Unnamed: 0,class,severity_score,urgent_hits,intense_hits,sent_compound,word_count,comment_text
238921,Dawn Class,99.8965,4,3,-0.9977,584,"As mentioned in my review answers, I am VERY d..."
63791,Dawn Class,99.847,3,4,-0.9966,283,I have cruised for over 40 years on many diffe...
400685,Epic Class,99.838,4,4,-0.9964,714,"Upon boarding, I discovered when I looked at m..."
386509,Dawn Class,99.8335,5,4,-0.9963,403,For many people this was supposed to be a once...
42424,Breakaway Class,99.82,4,6,-0.996,372,We were sick almost the entire time. We picked...
204462,Breakaway Class,99.811,3,3,-0.9958,357,If you look at my Prior comments regarding sho...
1939,Jewel Class,99.8065,4,3,-0.9957,589,where to begin..first embark process: there we...
260749,PoA Class,99.793,4,3,-0.9954,690,RESTAURANTS. (1) Service was mediocre to awful...
144213,Breakaway Class,99.739,3,3,-0.9942,708,"Our overall experience was good, but not great..."
354677,Jewel Class,99.7075,4,5,-0.9935,784,Upon being allowed in our room we were greeted...


In [28]:
severity_summary = (
    df.groupby(["class", "severity_band"])
      .size()
      .reset_index(name="count")
)

severity_summary["pct"] = (
    severity_summary
    .groupby("class")["count"]
    .transform(lambda x: x / x.sum() * 100)
)

severity_summary.sort_values(["class","severity_band"])


  df.groupby(["class", "severity_band"])


Unnamed: 0,class,severity_band,count,pct
0,Breakaway Class,Low,150011,80.783108
1,Breakaway Class,Medium,27196,14.645442
2,Breakaway Class,High,7741,4.168641
3,Breakaway Class,Critical,748,0.402809
4,Dawn Class,Low,16856,66.611342
5,Dawn Class,Medium,5720,22.604228
6,Dawn Class,High,2411,9.527761
7,Dawn Class,Critical,318,1.256669
8,Epic Class,Low,24280,73.53342
9,Epic Class,Medium,6469,19.59175


In [29]:
df_export = df[
    ["class","severity_score","severity_band",
     "urgent_hits","intense_hits","sent_compound","word_count"]
]

df_export.to_csv("severity_scored_comments_summary.csv", index=False)
severity_summary.to_csv("severity_distribution_by_class.csv", index=False)
