### **Import Libraries**

In [45]:
# import libraries

import pandas as pd
import tweepy
import json
import tweetnlp
import openai

# import functions

import sys
sys.path.append('../Functions/')  # Adjust the relative path as needed

from json_transform import get_all_keys, flatten_json

### **Load and Transform the Data**

In [46]:
'''
The raw JSON is nested, so every entry is flattened before building a DataFrame.
get_all_keys() and flatten_json() come from json_transform.py.
The collected keys are also kept as a list (all_keys_list).
Finally, rows whose text contains "update" are removed.
'''

# JSON is UTF-8 by specification; do not depend on the platform's default encoding.
with open("../../Data/Raw/tweets_ws23_v1.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

all_keys = get_all_keys(json_data)
all_keys_list = list(all_keys)
flattened_data = [flatten_json(entry) for entry in json_data]
all_tweets = pd.DataFrame(flattened_data)

# Filter out the rows that contain "update" in the text column.
# na=False makes a missing text count as "no match", so the mask stays purely boolean
# and can be negated safely (same handling as the later "delay" filter).
# regex=False because "update" is a plain substring, not a pattern.
contains_update = all_tweets['text'].str.contains("update", na=False, regex=False)
all_tweets = all_tweets[~contains_update]
number_total_tweets = all_tweets.shape[0]

In [47]:
'''
Keep only the columns needed downstream, split source_created_at into
a date part and a time part (fractional seconds removed), drop the
original timestamp column, and remove tweets whose text is duplicated.
'''

selected_columns = all_tweets[['text', 'author_id', 'source_created_at']].copy()
print(f'Total number of tweets: {selected_columns.shape[0]}')

# "<date> <time>.<fraction>" -> ["<date>", "<time>.<fraction>"]
timestamp_parts = [stamp.split(' ') for stamp in selected_columns['source_created_at']]

transformed_tweets = (
    selected_columns
    .assign(
        date=[parts[0] for parts in timestamp_parts],
        time=[parts[1].split('.')[0] for parts in timestamp_parts],
    )
    .drop(columns=['source_created_at'])
    .drop_duplicates(subset=['text'])
)

print(f'Total number of tweets after dropping duplicates: {transformed_tweets.shape[0]}')

Total number of tweets: 15824
Total number of tweets after dropping duplicates: 14683


In [48]:
'''
Split the tweets into one DataFrame per calendar date.
date_dfs maps each date string to the DataFrame holding that day's tweets,
e.g. date_dfs['2021-05-01'].
'''

grouped_tweets = transformed_tweets.groupby('date')
# Iterating a GroupBy yields (key, sub-frame) pairs, which dict() consumes directly
date_dfs = dict(iter(grouped_tweets))

### **Filter the Data**

In [49]:
selected_date = '2019-09-29'

# Validate the date BEFORE indexing date_dfs; otherwise a missing date fails with a
# bare KeyError and the availability check below is never reached.
if selected_date not in date_dfs:
    raise KeyError(f"Date {selected_date} is not available in the data.")

# selected_df is the DataFrame holding all tweets of the selected date
selected_df = date_dfs[selected_date]
number_selected = selected_df.shape[0]

print(f"Date {selected_date} is valid and has {number_selected} tweets.")

Date 2019-09-29 is valid and has 74 tweets.


In [50]:
# Remove every tweet mentioning "delay" (case-insensitive) and count how many were removed
tweets_of_day = date_dfs[selected_date]
mentions_delay = tweets_of_day['text'].str.contains('delay', case=False, na=False)

df_filter_delay = tweets_of_day[~mentions_delay]
number_delay_tweets = number_selected - df_filter_delay.shape[0]

print(f'Number of tweets after filtering out the word "delay": {df_filter_delay.shape[0]}')

Number of tweets after filtering out the word "delay": 24


In [51]:
# Label each remaining tweet with the tweetnlp irony model and store the label in an 'irony' column

model_irony = tweetnlp.load_model('irony')
irony_labels = [model_irony.predict(tweet)['label'] for tweet in df_filter_delay['text']]
df_filter_irony = df_filter_delay.assign(irony=irony_labels)

# Number of tweets labelled 'irony' (0 when that label never occurs)
irony_counts = df_filter_irony['irony'].value_counts()
number_irony_tweets = irony_counts.get('irony', 0)

print(f'Number of ironic tweets: {number_irony_tweets}')

Number of ironic tweets: 5


In [52]:
# Label each tweet with the tweetnlp sentiment model, then keep the likely complaints.

model = tweetnlp.load_model('sentiment')
sentiment_labels = [model.predict(tweet)['label'] for tweet in df_filter_irony['text']]
df_scored = df_filter_irony.assign(sentiment=sentiment_labels)

# A tweet is kept when it is ironic OR negative; everything else is dropped

is_ironic = df_scored['irony'] == 'irony'
is_negative = df_scored['sentiment'] == 'negative'
df_filter_sentiment = df_scored[is_ironic | is_negative]

number_complaints = len(df_filter_sentiment)
print(f'Number of tweets to be fed to ChatGPT: {number_complaints}')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of tweets to be fed to ChatGPT: 15


### **Prepare Data for LLM**

In [53]:
# Only the columns the prompt uses, renumbered from zero
day_tweets = df_filter_sentiment.loc[:, ['text', 'date', 'irony']].reset_index(drop=True)

### **LLM Analysis**

In [54]:
# The API key is read from the project-local keys module so it is not hardcoded in the notebook.
# (The former `OPENAI_API_KEY = OPENAI_API_KEY` self-assignment was a no-op and has been removed.)
from keys import OPENAI_API_KEY

In [71]:
# Constructing the prompt for OpenAI: instructions, one bullet per distinct tweet, then the answer format
prompt = """
As a senior train maintenance official, I am consolidating passenger complaints for efficient resolution. 
Below are various tweets from train passengers. 
As our customer's opinion is very important to us, analyze each message. 
Categorize each message as a complaint or not a complaint.
Complaints should be further categorized by one of these topics: Train Conditions, Staff Conduct and Service, Station Facilities, Safety and Security, Onboard Amenities, Communication and Information, Accessibility, Other.
When the category is 'Other', please write what the complaint is about.
If there is no complaint for a certain category, ignore that category.
Please note that some tweets are marked with 'irony'. Pay special attention to these and interpret them carefully.
Ignore the tweets that are not complaints.

Complaints:
"""

# Identical (text, date, irony) rows are collapsed into one row with an occurrence count
complaint_summary = day_tweets.groupby(['text', 'date', 'irony']).size().reset_index(name='count')

# Build all bullet lines first and join once instead of growing the string inside the loop.
# name=None yields plain tuples in column order: text, date, irony, count.
complaint_lines = []
for tweet_text, tweet_date, irony_label, occurrences in complaint_summary.itertuples(index=False, name=None):
    irony_tag = "Irony" if irony_label == 'irony' else "Non-Irony"
    # Collapse line breaks/tabs inside a tweet so every complaint stays on exactly one bullet line
    single_line_text = " ".join(tweet_text.split())
    complaint_lines.append(
        f"- Date Created: {tweet_date} - Complaint: {single_line_text} - {irony_tag} (Count: {occurrences})\n"
    )
prompt += "".join(complaint_lines)

# The closing note used to end mid-sentence; it now states the instruction completely.
prompt += """
For each complaint, provide the response in a structured format. Each element should be on a separate line with no additional spaces or lines in between:
- Date Created:
- Topic:
- Count of complaints for this topic:
- Summary:
- Suggestion:
Please maintain this format consistently throughout your response. Note: Even if the topic is "Other", still provide the count of complaints for this topic.
"""

# Set up OpenAI API key
openai.api_key = OPENAI_API_KEY

# Generate response from OpenAI (a fixed seed is passed to make repeated runs more comparable)
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-1106",  # Replace with the appropriate chat model
    seed=42,
    messages=[
        {"role": "system", "content": "You are a senior train maintenance official."},
        {"role": "user", "content": prompt}
    ]
)

# Extract the answer once; `data` is what the parsing cell consumes
data = response.choices[0].message["content"]
print(data)

- Date Created: 2019-09-29
- Topic: Staff Conduct and Service
- Count of complaints for this topic: 3
- Summary: Passengers complained about lack of information from the staff, unapologetic announcements, and a driver not opening the doors at the right station.
- Suggestion: Provide comprehensive and timely information to passengers, deliver announcements with empathy, and ensure proper coordination between staff members to avoid such incidents.

- Date Created: 2019-09-29
- Topic: Train Conditions
- Count of complaints for this topic: 1
- Summary: Passengers expressed dissatisfaction with the temperature inside the train and being left without a functioning toilet for over an hour.
- Suggestion: Ensure that the train maintains a comfortable temperature and promptly address any issues with onboard facilities to improve passenger experience.

- Date Created: 2019-09-29
- Topic: Communication and Information
- Count of complaints for this topic: 4
- Summary: Passengers highlighted inadeq

In [72]:
def _strip_label(field, label):
    """Return `field` without a leading "<label>:" marker and without surrounding whitespace.

    Only a prefix is removed, so the same words appearing later in the text are left untouched.
    """
    field = field.strip()
    marker = f"{label}:"
    if field.startswith(marker):
        return field[len(marker):].strip()
    return field


def parse_complaint_report(report):
    """Parse the model's bullet-list answer into row dictionaries.

    Parameters
    ----------
    report : str
        Text made of entries that each start with "- Date Created:" followed by
        "- Topic:", "- Count of complaints for this topic:", "- Summary:" and
        "- Suggestion:" lines.

    Returns
    -------
    (records, skipped) : tuple[list[dict], list[str]]
        `records` has one dict per well-formed entry; `skipped` holds the raw text of
        entries that did not have exactly five fields or whose count was not an integer
        (e.g. introductory text or an answer cut off mid-entry).
    """
    records, skipped = [], []
    # Splitting on the first label leaves the date value at the start of each chunk
    for entry in report.strip().split("- Date Created:"):
        entry = entry.strip()
        if not entry:  # text before the very first label is empty in the normal case
            continue

        parts = entry.split("\n- ")
        if len(parts) != 5:
            skipped.append(entry)
            continue

        date, topic, count, summary, suggestion = parts
        try:
            # Everything after the first ':' must be the integer count
            count_value = int(count.partition(":")[2])
        except ValueError:
            skipped.append(entry)
            continue

        records.append({
            'Date of Complaint': date.strip(),
            'Topic': _strip_label(topic, "Topic"),
            'Count': count_value,
            'Summary': _strip_label(summary, "Summary"),
            'Suggestion': _strip_label(suggestion, "Suggestion"),
        })
    return records, skipped


complaint_records, skipped_entries = parse_complaint_report(data)

if skipped_entries:
    print(f'Skipped {len(skipped_entries)} entries that did not match the expected format.')

# Create a DataFrame; explicit columns keep the layout stable even when nothing could be parsed
df = pd.DataFrame(
    complaint_records,
    columns=['Date of Complaint', 'Topic', 'Count', 'Summary', 'Suggestion'],
)

df  # Display the parsed complaints

Unnamed: 0,Date of Complaint,Topic,Count,Summary,Suggestion
0,2019-09-29,Staff Conduct and Service,3,Passengers complained about lack of informatio...,Provide comprehensive and timely information t...
1,2019-09-29,Train Conditions,1,Passengers expressed dissatisfaction with the ...,Ensure that the train maintains a comfortable ...
2,2019-09-29,Communication and Information,4,Passengers highlighted inadequate communicatio...,"Enhance communication channels, provide accura..."
3,2019-09-29,Safety and Security,1,A passenger reported being stranded on a train...,"Ensure that clear, timely, and accurate safety..."
4,2019-09-29,Accessibility,1,A passenger expressed frustration regarding th...,Ensure that trains are accessible and make arr...
5,2019-09-29,Other,2,Complaints such as the inconvenience caused by...,Address specific issues related to the cancell...


In [73]:
# One-row overview of how many tweets survived each filtering stage for the selected date

summary_row = {
    'Date': selected_date,
    'Total_Tweets': number_selected,
    'Delay_Tweets': number_delay_tweets,
    'Ironic_Tweets': number_irony_tweets,
    'Tweets2ChatGPT': number_complaints,
}
summary_df = pd.DataFrame(summary_row, index=[0])
summary_df

Unnamed: 0,Date,Total_Tweets,Delay_Tweets,Ironic_Tweets,Tweets2ChatGPT
0,2019-09-29,74,50,5,15


In [74]:
# Write the complaint table and the summary table to CSV, without the index column

csv_exports = {
    '../../Data/Processed/Sprint8 - ouput - Erjon.csv': df,
    '../../Data/Processed/Sprint8 - summary - Erjon.csv': summary_df,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)