In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns

In [None]:
# Load the raw tweets CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/Tweets.csv')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
def clean_df(df):
    """Return a trimmed, renamed view of the raw tweets frame, indexed by tweet date.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tweets frame. It must contain every column listed in ``keep`` below;
        any other columns are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``Date`` (the former ``tweet_created`` column) with the
        columns ``Rating``, ``Rating_Conf``, ``Negative_Reason``, ``Reason_Conf``,
        ``User``, ``Text``, ``Coordinates`` and ``airline``. The input is not modified.

    Raises
    ------
    KeyError
        If any of the expected raw columns is missing from ``df``.
    """
    keep = [
        'airline_sentiment', 'airline_sentiment_confidence',
        'negativereason', 'negativereason_confidence',
        'name', 'text', 'tweet_coord', 'tweet_created', 'airline',
    ]
    # 'airline' is deliberately absent from this mapping, so it keeps its original name.
    new_names = {
        'airline_sentiment': 'Rating',
        'airline_sentiment_confidence': 'Rating_Conf',
        'negativereason': 'Negative_Reason',
        'negativereason_confidence': 'Reason_Conf',
        'name': 'User',
        'text': 'Text',
        'tweet_coord': 'Coordinates',
        'tweet_created': 'Date',
    }
    return df.loc[:, keep].rename(columns=new_names).set_index('Date')
# Preview the first ten rows of the cleaned frame.
clean_df(df).head(10)

In [None]:
# Number of non-null ratings recorded for each airline.
rating_counts = clean_df(df).groupby('airline')['Rating'].count()
print(rating_counts)

In [None]:
# Tally rows per (airline, rating) pair, using the first column's non-null count as the tally.
pair_counts = clean_df(df).groupby(['airline', 'Rating']).count().iloc[:, 0]
# Move the airline level into the columns so the x axis is the rating and each airline is one bar series.
counts_by_rating = pair_counts.unstack(0)
ax = counts_by_rating.plot(kind='bar', title='Airline Ratings via Twitter')
ax.set_xlabel('Rating')
ax.set_ylabel('Rating Count')
plt.show()

In [None]:
# Clean once and derive both tallies from the same frame.
cleaned = clean_df(df)
# Rows per (airline, rating) pair, taken from the first column's non-null count.
itemized_tweets = cleaned.groupby(['airline', 'Rating']).count().iloc[:, 0]
# Non-null ratings per airline.
total_tweets = cleaned.groupby(['airline'])['Rating'].count()

In [None]:
# Display labels for the airlines, listed in the order of the groups in total_tweets.
# NOTE(review): assumes the grouped airlines sort into exactly this order and that every
# airline has all three rating groups, so that each airline owns three consecutive slots of
# itemized_tweets - confirm against the data.
airline_labels = ['American', 'Delta', 'Southwest', 'US Airways', 'United', 'Virgin']
RATINGS_PER_AIRLINE = 3
# Slot of the negative count inside each airline's group of three.
NEGATIVE_OFFSET = 0

# Share of negative tweets = (negative_tweets / total_tweets).
# .iloc makes the positional access explicit: plain integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
my_dict = {
    label: itemized_tweets.iloc[RATINGS_PER_AIRLINE * pos + NEGATIVE_OFFSET] / total_tweets.iloc[pos]
    for pos, label in enumerate(airline_labels)
}

perc_negative = pd.DataFrame.from_dict(my_dict, orient = 'index')
# from_dict(orient='index') leaves the single column unnamed, so name it explicitly.
perc_negative.columns = ['Percent Negative']
print(perc_negative)
ax = perc_negative.plot(kind = 'bar', rot=0, colormap = 'Blues_r', figsize = (15,6))
ax.set_xlabel('Airlines')
ax.set_ylabel('Percent Negative')
plt.show()

In [None]:
# Rows per (airline, rating) pair, taken from the first column's non-null count.
itemized_tweets = clean_df(df).groupby(['airline','Rating']).count().iloc[:,0]
# Non-null ratings per airline.
total_tweets = clean_df(df).groupby(['airline'])['Rating'].count()

# Display labels for the airlines, listed in the order of the groups in total_tweets.
# NOTE(review): assumes the grouped airlines sort into exactly this order and that every
# airline has all three rating groups, so that each airline owns three consecutive slots of
# itemized_tweets - confirm against the data.
airline_labels = ['American', 'Delta', 'Southwest', 'US Airways', 'United', 'Virgin']
RATINGS_PER_AIRLINE = 3
# Slot of the positive count inside each airline's group of three.
POSITIVE_OFFSET = 2

#Create a dictionary of the share of positive tweets = (positive_tweets / total_tweets)
# .iloc makes the positional access explicit: plain integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
my_dict = {
    label: itemized_tweets.iloc[RATINGS_PER_AIRLINE * pos + POSITIVE_OFFSET] / total_tweets.iloc[pos]
    for pos, label in enumerate(airline_labels)
}

#make a dataframe from the dictionary
perc_positive = pd.DataFrame.from_dict(my_dict, orient = 'index')
#have to manually set column name when using .from_dict() method
perc_positive.columns = ['Percent Positive']
print(perc_positive)
ax = perc_positive.plot(kind = 'bar', rot=0, colormap = 'Blues_r', figsize = (15,6))
ax.set_xlabel('Airlines')
# Label typo fixed ("Positve" -> "Positive").
ax.set_ylabel('Percent Positive')
plt.show()

In [None]:
def merge_dfs(x,y,z):
    """Join three frames side by side.

    The frames are concatenated along the column axis, in the order given,
    so their rows are aligned on index labels.
    """
    return pd.concat([x, y, z], axis=1)

In [None]:
# Rows per (airline, rating) pair, taken from the first column's non-null count.
itemized_tweets = clean_df(df).groupby(['airline','Rating']).count().iloc[:,0]

# Non-null ratings per airline.
total_tweets = clean_df(df).groupby(['airline'])['Rating'].count()

# Display labels for the airlines, listed in the order of the groups in total_tweets.
# NOTE(review): assumes the grouped airlines sort into exactly this order and that every
# airline has all three rating groups, so that each airline owns three consecutive slots of
# itemized_tweets - confirm against the data.
airline_labels = ['American', 'Delta', 'Southwest', 'US Airways', 'United', 'Virgin']
RATINGS_PER_AIRLINE = 3
# Slot of the neutral count inside each airline's group of three.
NEUTRAL_OFFSET = 1

#Create a dictionary of the share of neutral tweets = (neutral_tweets / total_tweets)
# .iloc makes the positional access explicit: plain integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
my_dict = {
    label: itemized_tweets.iloc[RATINGS_PER_AIRLINE * pos + NEUTRAL_OFFSET] / total_tweets.iloc[pos]
    for pos, label in enumerate(airline_labels)
}

#make a dataframe from the dictionary
perc_neutral = pd.DataFrame.from_dict(my_dict, orient = 'index')
#Have to manually set column name
perc_neutral.columns = ['Percent Neutral']

#call our function to concatenate all 3 dataframes of percentages
percentage = merge_dfs(perc_neutral, perc_negative, perc_positive)
print(percentage)

#graph all of our data
ax = percentage.plot(kind = 'bar', stacked = True, rot = 0, figsize = (15,6))
#set x label
ax.set_xlabel('Airlines')
#set y label
ax.set_ylabel('Percentages')
#move the legend below the axes so it does not cover the bars
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
          fancybox=True, shadow=True, ncol=5)

plt.show()

In [None]:
# Clean once and reuse the result for both the sample print-out and new_df.
cleaned = clean_df(df)

# Five sample rows, used to compare the airline label shipped with the data against the tweet text.
# Columns are selected by name instead of by position so the cell does not depend on column order.
sample_rows = cleaned.iloc[6750:6755]
observation = list(sample_rows['airline'])
tweet_text = list(sample_rows['Text'])

for airline_label, body in zip(observation, tweet_text):
    print('Airline as compiled: ' + str(airline_label))
    print('The actual tweet text: ')
    print(body, '\n''\n')

# Keep the first seven columns, i.e. everything except the trailing 'airline' column.
# .copy() gives an independent frame, so adding columns to new_df later does not write into a
# slice of another frame (which triggers SettingWithCopyWarning).
new_df = cleaned.iloc[:,0:7].copy()
new_df.head()

In [None]:
# Matches an @-mention made of ASCII letters. Written as a raw string without the backslash:
# '\@' is not a valid string escape and '@' needs no escaping in a regular expression.
mention_pattern = re.compile(r'@[A-Za-z]+')


def _first_mention(tweet):
    """Return the first @mention in ``tweet``, or '' when the tweet contains none."""
    found = mention_pattern.search(tweet)
    # An empty string (rather than an IndexError) keeps the column all-string for tweets with no mention.
    return found.group(0) if found else ''


new_df['Airline'] = new_df.Text.apply(_first_mention)

#check that our regular expression is working
list(new_df.Airline.head(10))

In [None]:
# With return_counts=True, np.unique returns a pair: (unique values, occurrences of each value).
twitter_tags = np.unique(new_df.Airline, return_counts = True)

#pair every unique tag with its total count so they read side by side instead of as 2 separate arrays
twitter_tags_count = [(tag, count) for tag, count in zip(twitter_tags[0], twitter_tags[1])]
twitter_tags_count

In [None]:
airline_list = ['@virginamerica','@united','@southwestair','@americanair','@jetblue','@usairways']

# One case-insensitive alternation over all known airline handles.
airlines = re.compile('|'.join(airline_list), re.IGNORECASE)


def _airline_handle(tag):
    """Return the known airline handle found in ``tag`` without its leading '@', or NaN if none matches."""
    found = airlines.search(tag)
    # Every alternative starts with '@', so dropping the first character of the match removes it.
    return found.group(0)[1:] if found else np.nan


# NOTE: this overwrites the Airline column with handles that no longer carry '@', so running
# the cell a second time without re-running the previous one would match nothing.
new_df['Airline'] = new_df.Airline.apply(_airline_handle)
print(list(new_df.Airline.head(10)))

In [None]:
no_airline = new_df.reset_index()
# For rows whose Airline is null, list every @mention found in the tweet text.
# The pattern is a raw string without the backslash: '\@' is not a valid string escape.
no_airline = no_airline[no_airline.Airline.isnull()].Text.apply(lambda x: re.findall(r'@[A-Za-z]+', x))
no_airline

In [None]:
# Pull the three columns out as plain lists for row-by-row printing.
rating, conf, text = (list(new_df[col]) for col in ('Rating', 'Rating_Conf', 'Text'))

# Show rating, confidence and text for the first ten tweets.
for row in range(10):
    print(rating[row], '\n', conf[row], '\n', text[row],'\n','\n')

In [None]:
# Minimum rating confidence a tweet needs in order to be kept.
MIN_RATING_CONF = 0.51
conf_df = new_df[new_df.Rating_Conf >= MIN_RATING_CONF]
# info() prints its report itself and returns None, so wrapping it in print() also printed a stray "None".
conf_df.info()
conf_df.head(10)

In [None]:
# Turn the Date index back into an ordinary column.
date = conf_df.reset_index()
# Convert the Date column to pandas datetimes, then keep only the calendar date (no time stamp).
date['Date'] = pd.to_datetime(date['Date']).dt.date
date['Date'].head()

In [None]:
# Rebind conf_df to the frame whose Date column holds plain dates; both names now refer to the same object.
conf_df = date
conf_df.head()

In [None]:
# Leave out rows labelled 'Delta Airlines', then tally rows per (Airline, Rating) pair
# using the first column's non-null count.
not_delta = conf_df.Airline != 'Delta Airlines'
test = conf_df[not_delta].groupby(['Airline','Rating']).count().iloc[:,0]
test

In [None]:
# Earliest and latest date in the filtered data, printed one per line.
for bound in (conf_df.Date.min(), conf_df.Date.max()):
    print(bound)

In [None]:
# day_df is not defined by any earlier cell, so `day_df.reset_index()` fails with NameError on
# a fresh kernel. Build it here from conf_df instead: one row per (Date, Airline, Rating) with
# the number of tweets in an unnamed column `0`.
# NOTE(review): reconstructed from how the later cells use day_df (they rename column 0 to
# 'Count' and filter on Rating / Airline / Date) - confirm this matches the intended aggregation.
day_df = conf_df.groupby(['Date', 'Airline', 'Rating']).size().reset_index()
day_df.head()

In [None]:
# Give the unnamed count column (labelled 0) a readable name.
day_df = day_df.rename(columns = {0:'Count'})
#keep only negative ratings; reset_index() also inserts the old row labels as a new first column
negative_only = day_df[day_df.Rating == 'negative'].reset_index()
#Remove delta since it only has 2 entries
day_df = negative_only[negative_only.Airline != 'Delta Airlines']
day_df.head()

In [None]:
# Keep columns 1 through 4 by position, dropping the first column.
day_df = day_df.iloc[:,1:5]

#groupby and plot data
# Sum per (Date, Airline), then spread the airlines across the columns so each one is a bar series.
daily_totals = day_df.groupby(['Date','Airline']).sum().unstack()
ax2 = daily_totals.plot(kind = 'bar', colormap = 'viridis', figsize = (15,6), rot = 70)
# NOTE(review): these legend labels are hard-coded and applied in order; they presumably match
# the order of the plotted airline columns - confirm against the data.
labels = ['American Airlines','Jet Blue','Southwest Airlines','US Airways','United Airlines','Virgin Airlines']
ax2.legend(labels = labels)
ax2.set_xlabel('Date')
ax2.set_ylabel('Negative Tweets')
plt.show()