### **Import Libraries**

In [1]:
# Importing libraries

import pandas as pd
import tweepy
import json
import tweetnlp
import openai

# Importing functions
import sys
sys.path.append('../Functions/')  # Adjust the relative path as needed

from json_transform import get_all_keys, flatten_json

2023-11-25 21:54:05,243	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-11-25 21:54:05,393	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


### **Load and Transform the Data**

In [3]:
''' 
sample_tweets_complete - Erjon.csv file is used as input.
This is done to compare the result with the survey.

'''
all_tweets = pd.read_csv("../../Data/Processed/Sprint4 - sample_tweets_complete - Erjon.csv")

In [4]:
# Printing tweet length
print("There are {} tweets in the dataset".format(len(all_tweets)))

There are 100 tweets in the dataset


In [5]:
# Filtering out the rows that contain "update" in the text column

all_tweets = all_tweets[~all_tweets['text'].str.contains("update")]
number_total_tweets = all_tweets.shape[0]

# Printing tweet length after filtering out the rows that contain "update" in the text column

print("There are " + str(number_total_tweets) + " tweets left after filtering out the rows that contain 'update' in the text column.")


There are 93 tweets left after filtering out the rows that contain 'update' in the text column.


In [6]:
'''
We will now drop the columns that we don't need and rename the columns that we will use.
Then split the source_created_at column into date and time for easier analysis.
Finally, we will drop the duplicated tweets.
'''

transformed_tweets = all_tweets[['text', 'author_id', 'source_created_at']].copy()
print(f'Total number of tweets: {transformed_tweets.shape[0]}')

transformed_tweets['date'] = transformed_tweets['source_created_at'].apply(lambda x: x.split(' ')[0])
transformed_tweets['time'] = transformed_tweets['source_created_at'].apply(lambda x: x.split(' ')[1].split('.')[0])
transformed_tweets.drop(['source_created_at'], axis=1, inplace=True)

transformed_tweets = transformed_tweets.drop_duplicates(subset=['text'])

print(f'Total number of tweets after dropping duplicates: {transformed_tweets.shape[0]}')

Total number of tweets: 93
Total number of tweets after dropping duplicates: 93


### **Filter the Data**

In [7]:
# Checking transform_tweets dataframe and remove text that contains "delay" in the text column
transformed_tweets = transformed_tweets[~transformed_tweets['text'].str.contains("delay")]
number_total_tweets = transformed_tweets.shape[0]

# Printing tweet length after filtering out the rows that contain "update" in the text column
print("There are " + str(number_total_tweets) + " tweets left after filtering out the rows that contain 'delay' in the text column.")

There are 54 tweets left after filtering out the rows that contain 'delay' in the text column.


In [8]:
# Checking for irony in the tweets using the tweetnlp library and save the results in a new column called irony
model_irony = tweetnlp.load_model('irony')
transformed_tweets_irony = transformed_tweets.copy()
transformed_tweets_irony['irony'] = transformed_tweets_irony['text'].apply(lambda x: model_irony.predict(x)['label'])

irony_counts = transformed_tweets_irony['irony'].value_counts()
number_irony_tweets = irony_counts.loc['irony'] if 'irony' in irony_counts else 0

print(f'Number of ironic tweets: {number_irony_tweets}')

Number of ironic tweets: 10


In [9]:
# Printing rows if irony column is equals to irony
ironic_tweets = transformed_tweets_irony[transformed_tweets_irony['irony'] == 'irony']
print(ironic_tweets)

                                                 text     author_id  \
0   @TLRailUK You opened the doors 20 seconds afte...  2.008622e+07   
23  @TLRailUK 0930 at ECR (on late 0807 BTN to LBG...  4.686315e+08   
34  @TLRailUK it seems remarkable that the train i...  5.324844e+08   
46  Thanks... for nothing! @TLRailUK No paper in t...  1.101208e+18   
53  Delightful @TLRailUK train this morning - fill...  2.503814e+09   
54  Oh great, looks like @TLRailUK on-train WiFi d...  7.696183e+07   
68  Govia Thameslink fined £1m over Gatwick Expres...  1.089150e+18   
80  The seats on @TLRailUK are made to give you ba...  7.695022e+08   
83  Commuting public: "hurrah, the school holidays...  1.155034e+08   
88  Taking the Biscuit?! Sweet treat apology from ...  5.351682e+08   

          date      time  irony  
0   2019-01-16  18:45:31  irony  
23  2019-08-14  08:33:01  irony  
34  2019-01-22  07:58:09  irony  
46  2019-05-29  18:30:26  irony  
53  2020-01-23  07:00:43  irony  
54  2019-10-09

In [10]:
# Adding a column with the sentiment of the tweet and filter out the tweets that are not negative.
model = tweetnlp.load_model('sentiment')
transformed_tweets_sentiment = transformed_tweets_irony.copy()

transformed_tweets_sentiment['sentiment'] = transformed_tweets_sentiment['text'].apply(lambda x: model.predict(x)['label'])

# Filtering out the tweets that are not negative and not ironic
transformed_tweets_sentiment = transformed_tweets_sentiment[(transformed_tweets_sentiment['irony'] == 'irony') | (transformed_tweets_sentiment['sentiment'] == 'negative')]

number_complaints = transformed_tweets_sentiment.shape[0]
print(f'Number of tweets to be fed to ChatGPT: {number_complaints}')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of tweets to be fed to ChatGPT: 33


                                                            PLEASE EVALUATE UNTIL HERE. DO NOT CONTINUE.

### **Prepare Data for LLM**

In [14]:
day_tweets = df_filter_sentiment[['text', 'date', 'irony']].reset_index(drop=True)

### **LLM Analysis**

In [11]:
from keys import OPENAI_API_KEY

OPENAI_API_KEY = OPENAI_API_KEY

In [12]:
# Constructing the prompt for OpenAI
prompt = """
As a senior train maintenance official, I am consolidating passenger complaints for efficient resolution. 
Below are various tweets from train passengers. 
As our customer's opinion is very important to us, analyze each message. 
Categorize each message as a complaint or not a complaint.
Complaints should be further categorized by one of these topics: Train Conditions, Staff Conduct and Service, Station Facilities, Safety and Security, Onboard Amenities, Communication and Information, Accessibility, Other.
When the category is 'Other', please write what the complaint is about.
If there is no complaint for a certain category, ignore that category.
Please note that some tweets are marked with 'irony'. Pay special attention to these and interpret them carefully.
Ignore the tweets that are not complaints.

Complaints:
"""

complaint_summary = day_tweets.groupby(['text', 'date', 'irony']).size().reset_index(name='count')

for index, row in complaint_summary.iterrows():
    irony_tag = "Irony" if row['irony'] == 'irony' else "Non-Irony"
    prompt += f"- Date Created: {row['date']} - Complaint: {row['text']} - {irony_tag} (Count: {row['count']})\n"

prompt += """
For each complaint, provide:
1. Summary of the complaint in a very concise manner
2. Topic (1 or 2 words)
3. Suggestion for resolution with actionable steps starting from the next day

Please structure your response as follows:
- Date Created:
- Topic:
- Count of complaints for this topic:
- Summary:
- Suggestion:
"""

# Set up OpenAI API key
openai.api_key = OPENAI_API_KEY

# Generate response from OpenAI
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # Replace with the appropriate chat model
    messages=[
        {"role": "system", "content": "You are a senior train maintenance official."},
        {"role": "user", "content": prompt}
    ]
)

# Print the response
print(response.choices[0].message["content"])
data = response.choices[0].message["content"]

- Date Created: 2019-09-29
- Topic: Irony
- Count of complaints for this topic: 4
- Summary: Passengers are expressing complaints sarcastically or ironically.
- Suggestion: It is important to acknowledge the sarcasm or irony in these complaints and reply with a light-hearted tone to show understanding and good humor.

- Date Created: 2019-09-29
- Topic: Train Conditions
- Count of complaints for this topic: 1
- Summary: Passengers are experiencing uncomfortable train conditions, such as low temperature and lack of toilet facilities.
- Suggestion: Ensure that trains are maintained at appropriate temperatures and that all facilities, including toilets, are in working order. Promptly address any malfunctions and provide alternate options if necessary.

- Date Created: 2019-09-29
- Topic: Staff Conduct and Service
- Count of complaints for this topic: 3
- Summary: Passengers are dissatisfied with the lack of information and announcements from staff during disruptions or incidents.
- Sugges

In [13]:
# Split the data into separate entries
entries = data.strip().split("- Date Created:")

# Initialize lists to hold the data
dates = []
topics = []
counts = []
summaries = []
suggestions = []

# Process each entry
for entry in entries:
    if entry:  # skip empty strings
        lines = entry.strip().split("\n- ")
        date, topic, count, summary, suggestion = lines
        dates.append(date.replace("Date Created: ", "").strip())
        topics.append(topic.replace("Topic: ", "").strip())
        counts.append(int(count.split(": ")[1]))
        summaries.append(summary.replace("Summary: ", "").strip())
        suggestions.append(suggestion.replace("Suggestion: ", "").strip())

# Create a DataFrame
df = pd.DataFrame({
    'Date of Complaint': dates,
    'Topic': topics,
    'Count': counts,
    'Summary': summaries,
    'Suggestion': suggestions
})

df  # To display the first few rows of the DataFrame

Unnamed: 0,Date of Complaint,Topic,Count,Summary,Suggestion
0,2019-09-29,Irony,4,Passengers are expressing complaints sarcastic...,It is important to acknowledge the sarcasm or ...
1,2019-09-29,Train Conditions,1,Passengers are experiencing uncomfortable trai...,Ensure that trains are maintained at appropria...
2,2019-09-29,Staff Conduct and Service,3,Passengers are dissatisfied with the lack of i...,Improve communication with passengers by provi...
3,2019-09-29,Onboard Amenities,1,Passengers are frustrated when train doors are...,Ensure that train doors are consistently opene...
4,2019-09-29,Communication and Information,4,Passengers are expressing frustration with the...,Improve communication channels by ensuring tha...
5,2019-09-29,Other,1,Passengers are frustrated with the lack of coo...,Strengthen coordination between Eurostar and T...


In [15]:
# create a summary dataframe with number_selected, number_delay_tweets, number_irony_tweets, number_complaints

summary_df = pd.DataFrame({'Total number': number_selected, '"delay" tweets': number_delay_tweets, 'Ironic tweets': number_irony_tweets, 'Tweets to be fed to ChatGPT': number_complaints}, index=[0])
summary_df

Unnamed: 0,Total number,"""delay"" tweets",Ironic tweets,Tweets to be fed to ChatGPT
0,74,50,5,15
