# Twitter Airline Sentiment Analysis

## Load the dataset

In [2]:
import numpy as np
import pandas as pd

In [9]:
# Read the labelled training tweets (comma-separated) into a DataFrame.
train_data = pd.read_csv("twitter_train.csv", sep=",")
# Bare last expression so the notebook renders the frame.
train_data

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...
10975,569934458364813313,neutral,American,,Cottopanama85,,0,@AmericanAir followback,,2015-02-23 10:58:58 -0800,"ohio,panama",
10976,568564006329434113,positive,United,,PaulBEsteves,,0,@united thanks for the help. Wish the phone re...,,2015-02-19 16:13:17 -0800,Brooklyn,Eastern Time (US & Canada)
10977,569643648910028801,negative,US Airways,,runfixsteve,,0,@usairways the. Worst. Ever. #dca #customerser...,,2015-02-22 15:43:24 -0800,"St. Augustine, Florida",
10978,568864981917110272,negative,US Airways,,CLChicosky,,0,@nrhodes85: look! Another apology. DO NOT FLY ...,,2015-02-20 12:09:15 -0800,,


In [10]:
# Read the test tweets (comma-separated) into a DataFrame.
test_data = pd.read_csv("twitter_test.csv", sep=",")
# Bare last expression so the notebook renders the frame.
test_data

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...
3655,570304244001193984,US Airways,,Anthony_Scerri,,0,@USAirways Been stuck for 40+ minutes due to l...,,2015-02-24 11:28:22 -0800,"Astoria, NY",Quito
3656,567847737061941249,US Airways,,mttdprkr,,0,@USAirways 4 hours... 4 hours... FOUR HOURS. ...,,2015-02-17 16:47:05 -0800,"Vancouver, WA",Pacific Time (US & Canada)
3657,567823564167192576,Virgin America,,miaerolinea,,1,Nice RT @VirginAmerica: The man of steel might...,,2015-02-17 15:11:02 -0800,Worldwide,Caracas
3658,570273819287531520,American,,GoldensPleasure,,0,@AmericanAir Aww Thanks AA..DFW was on GMA up ...,,2015-02-24 09:27:28 -0800,East Coast CT.,Central Time (US & Canada)


## Import the necessary libraries

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [15]:
# Model input is the raw tweet text; the target is the sentiment label column.
X_train, y_train = train_data["text"], train_data["airline_sentiment"]

In [16]:
# Hold out 20% of the labelled tweets for validation; the fixed seed keeps
# the split identical between runs.
(
    X_train_split,
    X_test_split,
    y_train_split,
    y_test_split,
) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Build a two-step pipeline: a TF-IDF vectorizer feeding a logistic
# regression classifier (iteration cap raised to 1000).
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000),
)

In [18]:
# Train the whole pipeline on the training portion of the split.
model.fit(X_train_split, y=y_train_split)

In [19]:
# Predict sentiment labels for the held-out split so they can be scored.
y_test_pred = model.predict(X=X_test_split)

In [20]:
# Accuracy: the fraction of held-out tweets whose predicted label matches
# the true label.
val_accuracy = accuracy_score(y_true=y_test_split, y_pred=y_test_pred)
# Bare last expression so the notebook displays the score.
val_accuracy

0.7950819672131147

In [21]:
# Predict sentiment for the test tweets, using the tweet text as the only
# model input.
X_test = test_data["text"]
test_prediction = model.predict(X=X_test)

In [23]:
# Write the predictions to a CSV file. As per the instruction, the file has
# no header row and a single column, so the index is left out as well.
output = pd.DataFrame(data=test_prediction)
output.to_csv(
    "twitter_sentiment_predictions.csv",
    index=False,
    header=False,
)


In [24]:
# Preview the first five predictions.
output.head(n=5)

Unnamed: 0,0
0,negative
1,negative
2,negative
3,negative
4,positive
