# iPhoneX Twitter Data Exploratory Analysis

These are tweets collected from the Tweepy API for occurences of the therm 'iPhoneX' in tweets.

In [1]:
# Import dependencies
import pandas
import functionality as func
import datetime
import numpy as np
from sklearn import preprocessing
from scipy import stats
from scipy.stats import chisquare
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from sklearn.preprocessing import Imputer
from datetime import datetime
import re




### Loading the data

In [2]:
iPhoneTweetsBefore = pandas.read_csv('iPhoneX_Old.csv')
print('Before Dimensions:')
print('\tNumber of Keys (Columns): ' + str(iPhoneTweetsBefore.shape[0]))
print('\tNumber of Entries (Rows): ' + str(iPhoneTweetsBefore.shape[1]))

iPhoneTweetsAfter = pandas.read_csv('iPhoneX_Bet.csv')
iPhoneTweetsAfter2 = pandas.read_csv('iPhoneX_New.csv')

iPhoneTweetsAfter = pandas.concat([iPhoneTweetsAfter, iPhoneTweetsAfter2])

print('After Dimensions:')
print('\tNumber of Keys (Columns): ' + str(iPhoneTweetsAfter.shape[0]))
print('\tNumber of Entries (Rows): ' + str(iPhoneTweetsAfter.shape[1]))

Before Dimensions:
	Number of Keys (Columns): 182509
	Number of Entries (Rows): 10
After Dimensions:
	Number of Keys (Columns): 232341
	Number of Entries (Rows): 12


### Dataset Statistics

In [3]:
iPhoneTweetsBefore.head()

Unnamed: 0.1,Unnamed: 0,date,favorites,geo,hashtags,id,mentions,retweets,text,username
0,0,2017-11-01 16:59,0,,,"""925875092533284864""\r\n",,0,"""iPhoneX 、iPhone8就像一场行为艺术互相映衬彼此的价值，一个趋向于利润和经营，...",loverty
1,1,2017-11-01 16:59,0,,,"""925875086741012480""\r\n",,0,"""今日メール来ないと iPhoneX 発売日に買えないじゃん(･_･)ヨドバシさーーーーん(...",bzf502
2,2,2017-11-01 16:59,0,,,"""925875084228784128""\r\n",,0,"""The iPhone X so damn trash""",wmjj5
3,3,2017-11-01 16:59,1,,#iPhoneX #iMacPro #PodernFamily,"""925875084098650112""\r\n",@applesliceau,2,"""Ep 38 @applesliceau - #iPhoneX Pre-order Pani...",applesliceau
4,4,2017-11-01 16:59,0,,,"""925875074950983680""\r\n",,0,"""Oh stop it was me she was bringing the iphone...",BenRebellious


In [4]:
iPhoneTweetsAfter.head()

Unnamed: 0.1,0,Unnamed: 0,date,favorites,geo,hashtags,id,id.1,mentions,retweets,text,username
0,1.0,,2017-11-06 15:59,0,,,,"""927687029952937984""\r\n",,0,"""プラダ アイフォンx/8 プラス 手帳型ケース ブランドPrada iphone x /8...",yuchimaretsujiy
1,2.0,,2017-11-06 15:59,0,,,,"""927687028984156161""\r\n",,0,"""★Winning An iPhone X From The Arcade Claw Mac...",NatsuSinging1
2,3.0,,2017-11-06 15:59,0,,#iPhoneXfree,,"""927687027130109952""\r\n",,0,"""Me voy a llevar un iPhone X GRATIS a casa #iP...",AngelPe16777328
3,4.0,,2017-11-06 15:59,0,,#iPhoneXfree,,"""927687024051499010""\r\n",,0,"""Me voy a llevar un iPhone X GRATIS a casa #iP...",Eduardo17017953
4,5.0,,2017-11-06 15:59,0,,#iPhoneXfree,,"""927687021140652032""\r\n",,0,"""Me voy a llevar un iPhone X GRATIS a casa #iP...",gabofragma


In [5]:
def getDate(row) : 
    date = row['date'].split(" ")
    return(str(date[0]))

In [6]:
iPhoneTweetsBefore['dates'] = iPhoneTweetsBefore.apply (lambda row: getDate (row), axis=1)
iPhoneTweetsBefore['dates'].unique()

array(['2017-11-01', '2017-10-31', '2017-10-30', '2017-10-29', '2017-10-28'], dtype=object)

In [7]:
iPhoneTweetsAfter['dates'] = iPhoneTweetsAfter.apply (lambda row: getDate (row), axis=1)
iPhoneTweetsAfter['dates'].unique()

array(['2017-11-06', '2017-11-05', '2017-11-04', '2017-11-09', '2017-11-08'], dtype=object)

In [8]:
def getLang(row) :
    en = False
    if row['text'] != np.NaN or row['text'] != None :
        tweetText = row['text']
        try :
            language = detect(tweetText)      
            if language == 'en' :
                en = True
        except :
            pass
    return(en)

In [9]:
iPhoneTweetsBefore['en'] = iPhoneTweetsBefore.apply (lambda row: getLang (row),axis=1)
iPhoneTweetsBefore['en'].describe()

count     182509
unique         2
top         True
freq      108677
Name: en, dtype: object

In [10]:
iPhoneTweetsAfter['en'] = iPhoneTweetsAfter.apply (lambda row: getLang (row),axis=1)
iPhoneTweetsAfter['en'].describe()

count     232341
unique         2
top         True
freq      137791
Name: en, dtype: object

## Sentiment Scores

### AFINN-111 Sentiment Calculation

In [11]:
sent_file = open("AFINN-111.txt")
termScores = {}
for line in sent_file :
    term, score = line.split("\t")
    termScores[term] = int(score)

def afinnSentiment(row) :
    sentimentScore = np.NaN
    if row['text'] != np.NaN or row['text'] != None :
        tweetText = row['text']  
        if row['en'] :
            tweetText = tweetText.lower()
            sentimentScore = 0
            for term in tweetText.split() :
                if term in termScores.keys() :
                    sentimentScore += termScores[term]
    return(sentimentScore)

### AFINN-111 Before Launch, Raw Sentiment Scores

In [12]:
iPhoneTweetsBefore['afinnSentiment'] = iPhoneTweetsBefore.apply (lambda row: afinnSentiment (row),axis=1)
iPhoneTweetsBefore['afinnSentiment'].describe()

count    108677.000000
mean          0.327135
std           1.523153
min         -16.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          17.000000
Name: afinnSentiment, dtype: float64

In [13]:
# Median
np.nanmedian(iPhoneTweetsBefore['afinnSentiment'])

0.0

In [14]:
# Mode
stats.mode(iPhoneTweetsBefore['afinnSentiment'])[0][0]

0.0

In [15]:
# Range
np.ptp(iPhoneTweetsBefore['afinnSentiment'])

33.0

### AFINN-111 After Launch, Raw Sentiment Scores

In [16]:
iPhoneTweetsAfter['afinnSentiment'] = iPhoneTweetsAfter.apply (lambda row: afinnSentiment (row),axis=1)
iPhoneTweetsAfter['afinnSentiment'].describe()

count    137791.000000
mean          0.423446
std           1.709810
min         -27.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          28.000000
Name: afinnSentiment, dtype: float64

In [17]:
# Median
np.nanmedian(iPhoneTweetsAfter['afinnSentiment'])

0.0

In [18]:
# Mode
stats.mode(iPhoneTweetsAfter['afinnSentiment'])[0][0]

0.0

In [19]:
# Range
np.ptp(iPhoneTweetsAfter['afinnSentiment'])

55.0

### AFINN-111 Chi-Square Test of Significance, Raw Sentiment Scores Before & After Launch

In [20]:
beforeMean = np.nanmean(iPhoneTweetsBefore['afinnSentiment'])
afterMean = np.nanmean(iPhoneTweetsAfter['afinnSentiment'])

obs = np.array([[0, 1], [beforeMean, afterMean]]).T
obs.shape
chisquare(obs, axis=None)

Power_divergenceResult(statistic=1.1886127335704135, pvalue=0.75573672427445537)

### AFINN-111 Before Launch, Normalized Sentiment Scores

In [21]:
x = iPhoneTweetsBefore['afinnSentiment']
x = x.values.reshape(-1, 1)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(x)
x = imp.transform(x)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
iPhoneTweetsBeforeNormalized = pandas.DataFrame(x_scaled)
iPhoneTweetsBeforeNormalized.describe()

Unnamed: 0,0
count,182509.0
mean,0.494762
std,0.035617
min,0.0
25%,0.484848
50%,0.494762
75%,0.494762
max,1.0


### AFINN-111 After Launch, Normalized Sentiment Scores

In [22]:
x = iPhoneTweetsAfter['afinnSentiment']
x = x.values.reshape(-1, 1)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(x)
x = imp.transform(x)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
iPhoneTweetsAfterNormalized = pandas.DataFrame(x_scaled)
iPhoneTweetsAfterNormalized.describe()

Unnamed: 0,0
count,232341.0
mean,0.498608
std,0.02394
min,0.0
25%,0.490909
50%,0.498608
75%,0.498608
max,1.0


### AFINN-111 Chi-Square Test of Significance, Normalized Sentiment Scores Before & After Launch

In [23]:
beforeMean = np.nanmean(iPhoneTweetsBeforeNormalized)
afterMean = np.nanmean(iPhoneTweetsAfterNormalized)

obs = np.array([[0, 1], [beforeMean, afterMean]]).T
obs2 = np.array([[0, 1], [beforeMean, afterMean]])
obs.shape
chisquare(obs, axis=None)

Power_divergenceResult(statistic=1.0033630464608687, pvalue=0.80043819887815837)

# NLTK (Python Library) Vader Sentiment Calculation

In [24]:
vader = SentimentIntensityAnalyzer()
def nltkSentiment(row) :
    sentimentScore = np.NaN
    if row['text'] != np.NaN or row['text'] != None :
        tweetText = row['text']
        if row['en'] :
            tweetText = tweetText.lower()
            scores = vader.polarity_scores(tweetText)
            sentimentScore = scores['pos'] - scores['neg']
    return(sentimentScore)

### NLTK Before Launch, Raw Sentiment Scores

In [25]:
iPhoneTweetsBefore['nltkSentiment'] = iPhoneTweetsBefore.apply (lambda row: nltkSentiment (row), axis=1)
iPhoneTweetsBefore['nltkSentiment'].describe()

count    108677.000000
mean          0.042300
std           0.136652
min          -0.798000
25%           0.000000
50%           0.000000
75%           0.120000
max           0.851000
Name: nltkSentiment, dtype: float64

In [26]:
# Median
np.nanmedian(iPhoneTweetsBefore['nltkSentiment'])

0.0

In [27]:
# Mode
stats.mode(iPhoneTweetsBefore['nltkSentiment'])[0][0]

0.0

In [28]:
# Range
np.ptp(iPhoneTweetsBefore['nltkSentiment'])

1.649

### NLTK After Launch, Raw Sentiment Scores

In [29]:
iPhoneTweetsAfter['nltkSentiment'] = iPhoneTweetsAfter.apply (lambda row: nltkSentiment (row), axis=1)
iPhoneTweetsAfter['nltkSentiment'].describe()

count    137791.000000
mean          0.063255
std           0.153043
min          -0.857000
25%           0.000000
50%           0.000000
75%           0.130000
max           1.000000
Name: nltkSentiment, dtype: float64

In [30]:
# Median
np.nanmedian(iPhoneTweetsAfter['nltkSentiment'])

0.0

In [31]:
# Mode
stats.mode(iPhoneTweetsAfter['nltkSentiment'])[0][0]

0.0

In [32]:
# Range
np.ptp(iPhoneTweetsAfter['nltkSentiment'])

1.857

### NLTK Chi-Square Test of Significance, Raw Sentiment Scores Before & After Launch

In [33]:
beforeMean = np.nanmean(iPhoneTweetsBefore['nltkSentiment'])
afterMean = np.nanmean(iPhoneTweetsAfter['nltkSentiment'])

obs = np.array([[0, 1], [beforeMean, afterMean]]).T
obs.shape
chisquare(obs, axis=None)

Power_divergenceResult(statistic=2.5334882593754511, pvalue=0.4692693936842951)

### NLTK Before Launch, Normalized Sentiment Scores

In [34]:
x = iPhoneTweetsBefore['nltkSentiment'].copy()
x = x.values.reshape(-1, 1)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(x)
x = imp.transform(x)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
iPhoneTweetsBeforeNormalized = pandas.DataFrame(x_scaled)
iPhoneTweetsBeforeNormalized.describe()

Unnamed: 0,0
count,182509.0
mean,0.509582
std,0.063947
min,0.0
25%,0.48393
50%,0.509582
75%,0.509582
max,1.0


### NLTK After Launch, Normalized Sentiment Scores

In [35]:
x = iPhoneTweetsAfter['nltkSentiment'].copy()
x = x.values.reshape(-1, 1)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(x)
x = imp.transform(x)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
iPhoneTweetsAfterNormalized = pandas.DataFrame(x_scaled)
iPhoneTweetsAfterNormalized.describe()


Unnamed: 0,0
count,232341.0
mean,0.49556
std,0.063467
min,0.0
25%,0.461497
50%,0.49556
75%,0.504039
max,1.0


### NLTK Chi-Square Test of Significance, Normalized Sentiment Scores Before & After Launch

In [36]:
beforeMean = np.nanmean(iPhoneTweetsBeforeNormalized)
afterMean = np.nanmean(iPhoneTweetsAfterNormalized)

obs = np.array([[0, 1], [beforeMean, afterMean]]).T
obs.shape
chisquare(obs, axis=None)

Power_divergenceResult(statistic=0.99764510686765828, pvalue=0.80182177183496139)

## Saves files to CSV

In [46]:
header = ['date', 'dates', 'nltkSentiment', 'afinnSentiment']
iPhoneTweetsAfter.to_csv("after.csv", columns = header, header = True)

iPhoneTweetsBefore.to_csv("before.csv", columns = header, header = True)

both = pandas.concat([iPhoneTweetsAfter, iPhoneTweetsBefore])
both.to_csv("both.csv", columns = header, header = True)


In [47]:
beforeMean = np.nanmean(iPhoneTweetsBefore['nltkSentiment'])
afterMean = np.nanmean(iPhoneTweetsAfter['nltkSentiment'])
nltk_BA = pandas.DataFrame([[0, 1], [beforeMean, afterMean]])
nltk_BA.to_csv("nltk_BA.csv", header = True)

In [48]:
beforeMean = np.nanmean(iPhoneTweetsBefore['afinnSentiment'])
afterMean = np.nanmean(iPhoneTweetsAfter['afinnSentiment'])
afinn_BA = pandas.DataFrame([[0, 1], [beforeMean, afterMean]])
afinn_BA.to_csv("afinn_BA.csv", header = True)