In [49]:
# import libraries

import pandas as pd
import tweepy
import os
import json
from datetime import datetime, timedelta
import tweetnlp
import openai

### **Large Test**

In [50]:
# Load the original JSON file

# JSON text is UTF-8 by specification; pin the encoding so the read does not
# depend on the platform's default locale encoding.
with open("../../Data/Raw/tweets_ws23_v1.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

def get_all_keys(obj):
    """Collect every dictionary key found anywhere inside a nested structure.

    Dicts contribute their own keys and are searched through their values;
    lists are searched element by element; any other value is ignored.

    Returns:
        A set with the unique keys that were encountered.
    """
    found = set()
    pending = [obj]  # explicit work list instead of recursion
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            found.update(node.keys())
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return found

# Get all unique keys
all_keys = get_all_keys(json_data)

# Convert the set to a sorted list: a set has no stable order between kernel
# restarts, so sorting keeps the listing reproducible when it is inspected.
all_keys_list = sorted(all_keys)

def flatten_json(y, sep='_'):
    """Flatten a nested JSON object into a single-level dict.

    Nested dict keys and list positions are joined with ``sep`` to form the
    flat key, e.g. ``{"a": {"b": [10]}}`` becomes ``{"a_b_0": 10}``.

    Args:
        y: The decoded JSON value (dict, list or scalar) to flatten.
        sep: String placed between the path components of each flat key.

    Returns:
        A dict mapping each flattened path to its leaf value. Empty dicts and
        empty lists contribute no entry. A scalar passed at the top level is
        stored under the empty-string key. If two different paths produce the
        same flat key, the one visited last wins.
    """
    out = {}

    def flatten(x, path=()):
        # isinstance (rather than an exact type check) so dict/list subclasses
        # such as OrderedDict are descended into instead of stored as leaves.
        if isinstance(x, dict):
            for key, value in x.items():
                # str() keeps non-string keys from breaking the join below.
                flatten(value, path + (str(key),))
        elif isinstance(x, list):
            for position, item in enumerate(x):
                flatten(item, path + (str(position),))
        else:
            out[sep.join(path)] = x

    flatten(y)
    return out

# Flatten every top-level entry of the loaded JSON into a single-level dict
flattened_data = list(map(flatten_json, json_data))

# Build the DataFrame: one row per flattened entry, one column per flat key
df = pd.DataFrame(flattened_data)

# Preview the first three rows
df.head(3)

Unnamed: 0,source_created_at,author_id,text,source,language,longitude,latitude,id,labels_topic_0_tweet_id,labels_topic_0_user_id,labels_topic_0_relevant,labels_topic_0_topic,labels_topic_0_ground_truth,labels_topic_0_id,labels_sentiment_0_tweet_id,labels_sentiment_0_user_id,labels_sentiment_0_sentiment,labels_sentiment_0_ground_truth,labels_sentiment_0_id,source_id
0,2020-09-18 21:56:20.798000,593731316,@DSisourath The Thameslink core between London...,sprinklr,en,-0.12574,51.50853,acd7673f-e621-5f1a-d662-df278964a6ea,acd7673f-e621-5f1a-d662-df278964a6ea,Z003XDCS,True,service,True,8655353b-cc71-1e89-95d5-1c4e6bf200e1,acd7673f-e621-5f1a-d662-df278964a6ea,Z003XDCS,negative,True,5ee4d218-65a9-70a0-0964-a5b00e594a41,
1,2020-10-13 07:31:53.122000,745583289520496640,@DulwichHistory Loving the complaint about peo...,sprinklr,en,-0.12574,51.50853,5b92aba8-4b05-6c63-8485-e9c870742137,5b92aba8-4b05-6c63-8485-e9c870742137,Z003XDCS,True,delays,True,8aee5c86-6e93-24d4-6d9d-d8a9f2f19cef,5b92aba8-4b05-6c63-8485-e9c870742137,Z003XDCS,negative,True,2f8d741f-59cf-d1ab-27f9-5537a9718f94,
2,2020-10-26 19:27:24.695000,303134761,@SW_Help .And yet you have no toilets on some ...,sprinklr,en,-0.12574,51.50853,0a799c07-8b76-17ba-b840-e538d51e832d,0a799c07-8b76-17ba-b840-e538d51e832d,Z003XDCS,True,toilets,True,2d54f18a-442d-b59b-9daf-0490dc64d21a,0a799c07-8b76-17ba-b840-e538d51e832d,Z003XDCS,negative,True,be479c30-17c3-220f-8eae-eaf6fc874431,


In [51]:
# keep only the columns used by the rest of the analysis

wanted_columns = ['text', 'author_id', 'source_created_at']
df_columns = df[wanted_columns]
df_columns.head(3)

Unnamed: 0,text,author_id,source_created_at
0,@DSisourath The Thameslink core between London...,593731316,2020-09-18 21:56:20.798000
1,@DulwichHistory Loving the complaint about peo...,745583289520496640,2020-10-13 07:31:53.122000
2,@SW_Help .And yet you have no toilets on some ...,303134761,2020-10-26 19:27:24.695000


In [52]:
# drop every tweet whose text mentions 'delay' or 'updates' (case-insensitive);
# rows with missing text never match and are therefore kept

tweet_text = df_columns['text']
mentions_delay = tweet_text.str.contains('delay', case=False, na=False)
mentions_updates = tweet_text.str.contains('updates', case=False, na=False)
df_filter = df_columns[~(mentions_delay | mentions_updates)]
df_filter

Unnamed: 0,text,author_id,source_created_at
0,@DSisourath The Thameslink core between London...,593731316,2020-09-18 21:56:20.798000
1,@DulwichHistory Loving the complaint about peo...,745583289520496640,2020-10-13 07:31:53.122000
2,@SW_Help .And yet you have no toilets on some ...,303134761,2020-10-26 19:27:24.695000
3,@SW_Help you have no toilets on some of your t...,303134761,2020-10-26 19:28:49.281000
4,@SpeedySticks007 @MrNeilJH @TLRailUK @christia...,56427671,2020-09-28 11:59:41.212000
...,...,...,...
16944,Haha oh man the audio corruption on @TLRailUK ...,24940900,2019-07-11 07:34:35
16945,@TLRailUK SweetIs there a plug to charge my ph...,63559912,2020-08-10 11:19:10.181000
16946,@TLRailUK now there are far fewer commuters ha...,277455263,2020-08-29 09:51:10.833000
16947,@geofftech I am voting for Thameslink. 1. in a...,3050957613,2020-11-02 12:06:06.967000


In [53]:
# split the timestamp into separate date and time columns

df_analysis = df_filter.copy()

# Vectorised string ops instead of a row-wise apply: a missing timestamp, or one
# without a time part, yields NaN here rather than raising on `.split`.
timestamp_parts = df_analysis['source_created_at'].str.split(' ')
df_analysis['date'] = timestamp_parts.str[0]
# Cut off the fractional seconds, keeping HH:MM:SS
df_analysis['time'] = timestamp_parts.str[1].str.split('.').str[0]

# Reassign instead of dropping in place so the cell is safe to re-run
df_analysis = df_analysis.drop(columns=['source_created_at'])
df_analysis.head(3)

Unnamed: 0,text,author_id,date,time
0,@DSisourath The Thameslink core between London...,593731316,2020-09-18,21:56:20
1,@DulwichHistory Loving the complaint about peo...,745583289520496640,2020-10-13,07:31:53
2,@SW_Help .And yet you have no toilets on some ...,303134761,2020-10-26,19:27:24


In [54]:
# keep the 50 most recent tweets, ordered newest first by date and then time

N_LATEST = 50
df_50 = (
    df_analysis
    .sort_values(by=['date', 'time'], ascending=False)
    .head(N_LATEST)
)

In [55]:
# add a column with the sentiment of the tweet
# keep only the negative and neutral tweets

# NOTE(review): `model` is not defined in any cell visible in this notebook, so
# a fresh kernel would fail here with a NameError. Presumably it was the
# tweetnlp sentiment classifier (its labels match the filter below) — confirm.
# The guard leaves an already-loaded model untouched.
if 'model' not in globals():
    model = tweetnlp.load_model('sentiment')

# `assign` returns a new frame rather than writing a column into `df_50` in
# place, so re-running the cell always starts from the same inputs.
df_50 = df_50.assign(
    sentiment=df_50['text'].map(lambda tweet: model.predict(tweet)['label'])
)
df_complaints = df_50[df_50['sentiment'].isin(['negative', 'neutral'])]
df_complaints.shape

(36, 5)

### **LLM Analysis**

In [58]:
# Read the OpenAI API key from the local `keys` module instead of hard-coding
# the secret in the notebook. The import already binds the name, so no further
# assignment is needed.
from keys import OPENAI_API_KEY

In [47]:
# Constructing the prompt for OpenAI

# Fixed instructions placed before the list of complaints
PROMPT_INTRO = """
As a senior train maintenance official, I am consolidating passenger complaints for efficient resolution. 
Below are various complaints from train passengers, each with a specific topic. 
I need a structured summary for each unique topic, combining similar complaints and providing actionable suggestions for resolution starting from the next day. 
Please group the complaints by their topics, summarize the key points, and suggest collective resolutions.
If the message is not a complaint, please ignore it.
End the reponse with the following line: "Please note that these suggestions are based on the provided information and may need further assessment and customization. Help us improve our service by providing feedback on your experience."



Complaints:
"""

# Fixed instructions placed after the list, describing the expected answer layout
PROMPT_FORMAT = """
For each complaint, provide:
1. Summary of the complaint in a very concise manner
2. Topic (1 or 2 words)
3. Suggestion for resolution with actionable steps starting from the next day

Please structure your response as follows:
- Date Created:
- Topic:
- Count of complaints for this topic:
- Summary:
- Suggestion:
"""

# One row per distinct (text, date) pair, with the number of rows sharing it
complaint_summary = (
    df_complaints
    .groupby(['text', 'date'])
    .size()
    .reset_index(name='count')
)

# One bullet line per row of the summary
complaint_lines = [
    f"- Date Created: {created} - Complaint: {tweet} (Count: {occurrences})\n"
    for created, tweet, occurrences in zip(
        complaint_summary['date'],
        complaint_summary['text'],
        complaint_summary['count'],
    )
]

prompt = PROMPT_INTRO + "".join(complaint_lines) + PROMPT_FORMAT

# Set up OpenAI API key
openai.api_key = OPENAI_API_KEY

# Generate response from OpenAI
chat_messages = [
    {"role": "system", "content": "You are a senior train maintenance official."},
    {"role": "user", "content": prompt},
]
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # Replace with the appropriate chat model
    messages=chat_messages,
)

# Keep the reply text for the parsing step and print it
data = response.choices[0].message["content"]
print(data)

- Date Created: 2020-11-17
- Topic: Train Comfort
- Count of complaints for this topic: 1
- Summary: Passenger complains about a pole in the middle of the doors, particularly near wheelchair doors, on Thameslink trains.
- Suggestion: Conduct a review of the train design to address the issue of the pole obstructing wheelchair access. Consider relocating or redesigning the pole to improve accessibility for wheelchair users. Communicate the progress of the review to passengers through social media and customer service channels.

- Date Created: 2020-11-14
- Topic: Train Cleanliness and Maintenance
- Count of complaints for this topic: 4
- Summary: Passengers express dissatisfaction with dirty trains, uncleaned toilets, graffiti in toilets, and broken train doors.
- Suggestion: Implement a thorough cleaning and maintenance plan for trains. Increase the frequency of cleaning, especially in toilets, to ensure cleanliness and address graffiti issues promptly. Develop a system for reporting an

In [48]:
# Parse the model's free-text report back into a table, one row per entry.

# Text that opens every entry in the layout requested by the prompt
ENTRY_MARKER = "- Date Created:"
# Separator between the bullet fields inside one entry
FIELD_SEPARATOR = "\n- "
# Opening words of the closing disclaimer the prompt asks the model to append;
# it follows the last entry and must not end up inside that entry's suggestion.
DISCLAIMER_START = "Please note that these suggestions"
REPORT_COLUMNS = ['Date of Complaint', 'Topic', 'Count', 'Summary', 'Suggestion']


def _strip_label(text, label):
    """Return `text` without a leading 'label:' prefix and surrounding whitespace."""
    text = text.strip()
    prefix = label + ":"
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()


def parse_complaint_report(report):
    """Turn the model's bullet-list report into a list of row dicts.

    Each entry is expected to hold five bullet fields in this order: date,
    topic, count, summary, suggestion.

    Args:
        report: The full reply text returned by the model.

    Returns:
        A list of dicts keyed by the names in ``REPORT_COLUMNS``. Entries that
        are truncated or whose count cannot be read as an integer are skipped
        (and reported with a printed notice) instead of aborting the parse.
    """
    records = []
    skipped = 0
    # Whatever precedes the first marker is preamble (or empty), not an entry.
    for chunk in report.split(ENTRY_MARKER)[1:]:
        fields = chunk.strip().split(FIELD_SEPARATOR)
        if len(fields) < 5:
            skipped += 1  # truncated or malformed entry
            continue
        date, topic, count, summary = fields[:4]
        # Extra bullets after the fifth field belong to the suggestion text;
        # the trailing disclaimer, if present, is cut off.
        suggestion = FIELD_SEPARATOR.join(fields[4:]).split(DISCLAIMER_START, 1)[0]
        try:
            # First token after the label's colon, e.g. "4" in "...: 4"
            n_complaints = int(count.partition(":")[2].split()[0])
        except (IndexError, ValueError):
            skipped += 1
            continue
        records.append({
            'Date of Complaint': _strip_label(date, "Date Created"),
            'Topic': _strip_label(topic, "Topic"),
            'Count': n_complaints,
            'Summary': _strip_label(summary, "Summary"),
            'Suggestion': _strip_label(suggestion, "Suggestion"),
        })
    if skipped:
        print(f"Skipped {skipped} malformed entr{'y' if skipped == 1 else 'ies'} in the model response")
    return records


# Create a DataFrame
# NOTE(review): this rebinds `df`, which earlier held the flattened tweets; the
# name is kept so any later cell relying on it still works — consider renaming.
df = pd.DataFrame(parse_complaint_report(data), columns=REPORT_COLUMNS)

df.head()  # To display the first few rows of the DataFrame

Unnamed: 0,Date of Complaint,Topic,Count,Summary,Suggestion
0,2020-11-17,Train Comfort,1,Passenger complains about a pole in the middle...,Conduct a review of the train design to addres...
1,2020-11-14,Train Cleanliness and Maintenance,4,Passengers express dissatisfaction with dirty ...,Implement a thorough cleaning and maintenance ...
2,2020-11-18,Overcrowding and Seat Availability,1,Passenger notes that a service is busier than ...,Monitor passenger demand and adjust service fr...
3,2020-11-20,Train Appearance,1,Passenger criticizes the choice of a light-col...,Evaluate the train paint color and consider al...
4,2020-11-16,Train Timetable and Delays,6,Passengers raise concerns about inconsistent t...,Conduct a review of the train schedule and ana...
