In [62]:
#The problem statement
#There are three parts to this question:

#1. Fetch comments or replies to any social media platform of your choice - Twitter, Facebook, reddit, etc.
#The only criteria is that there needs to be a message and a "rating",
#might be the likes or dislikes on the message, but it should be reducible to a single integer.
#How you reduce it depends on the platform you choose and the reasoning behind the reduction.

#2.Extract the most important "keywords" or "key phrases" in the reviews you fetch.
#They don't have to be per comment, but have to be the most important keywords or phrases per selected social post. 
#In essence, reading the list of keywords or phrases you extract should give a reader a rough insight
#into what people are discussing in the comments without reading all the comments.

#3.Given a bit of text, rank it according to the likelihood of it becoming popular for the given post,
#i.e. gets a high rating in (1) above. The levels of ranks you choose are up to you, at least three is expected.


In [75]:
#Step - 1
#1. Fetch comments or replies to any social media platform of your choice - Twitter, Facebook, reddit, etc.
#The only criteria is that there needs to be a message and a "rating",
#might be the likes or dislikes on the message, but it should be reducible to a single integer.
#How you reduce it depends on the platform you choose and the reasoning behind the reduction.

#Social Media Platform Used - Twitter
#The rating criterion is the number of retweets of a tweet

#Import the libraries

import tweepy #For fetching tweets from twitter
from textblob import TextBlob  #For sentiment analysis
import config

#from twython import Twython, TwythonError

# create a Twython object by passing the necessary secret passwords
#twitter = Twython(config.consumer_key, config.consumer_secret, config.access_token, config.access_token_secret)

#Authentication with twitter
# Build the handler from the consumer key/secret held in the local config
# module, then attach the access token pair to it.
auth = tweepy.OAuthHandler(
    config.consumer_key,
    config.consumer_secret,
)
auth.set_access_token(
    config.access_token,
    config.access_token_secret,
)

# API client used by the later cells to run searches.
api = tweepy.API(auth)



In [64]:
#Search for tweets containing a particular word from the past 7 days using the twitter api 

public_tweets = api.search('India Football')

#Find the most popular tweet
#max_retweeted = 0
#for tweet in public_tweets:
#    if tweet.retweet_count > max_retweeted:
#        max_retweeted = tweet.retweet_count
#        popular_tweet_id = tweet.id

# For every fetched tweet show its text, the TextBlob sentiment of that text,
# and the retweet count that serves as the tweet's rating.
for tweet in public_tweets:
    print("***Tweet Text->>", tweet.text)
    # Sentiment is evaluated only after the text has been printed.
    print("***Tweet Sentiment->>", TextBlob(tweet.text).sentiment)
    print("Retweet count: ", tweet.retweet_count)
    print("\n")

    


***Tweet Text->> What Jurgen Klopp said about Rhian Brewster will excite Liverpool fans https://t.co/wYsHgXM9sZ via @ScoopLiverpool https://t.co/LbyEvsjfQV
***Tweet Sentiment->> Sentiment(polarity=0.0, subjectivity=0.0)
Retweet count:  0


***Tweet Text->> India on course to topple China as best attended FIFA U-17 World Cup https://t.co/NVO86Ac4em
***Tweet Sentiment->> Sentiment(polarity=1.0, subjectivity=0.3)
Retweet count:  0


***Tweet Text->> New post (FIFA U-17 World Cup 2017 India vs Ghana LIVE Football Match Score: Colts ...) has been published on  -… https://t.co/NavTZNyRJq
***Tweet Sentiment->> Sentiment(polarity=0.13636363636363635, subjectivity=0.4772727272727273)
Retweet count:  0


***Tweet Text->> AFC general secretary Dato' Windsor wants one football league in India from next season https://t.co/PE3VMpcD5S… https://t.co/74EiNHusiE
***Tweet Sentiment->> Sentiment(polarity=0.08333333333333333, subjectivity=0.19999999999999998)
Retweet count:  0


***Tweet Text->> RT @blue2

In [70]:
#Step - 2.Extract the most important "keywords" or "key phrases" in the reviews you fetch.
#They don't have to be per comment, but have to be the most important keywords or phrases per selected social post. 
#In essence, reading the list of keywords or phrases you extract should give a reader a rough insight
#into what people are discussing in the comments without reading all the comments.


In [71]:
#First Method - Manually

#Text Preprocessing on the public_tweets:
#lowercase each tweet, strip punctuation, then drop English stop words.

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# A set gives constant-time membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))

# r'\w+' keeps only runs of word characters, so punctuation never becomes a token.
# The captured output shows this also splits URLs into fragments ('https', 't', 'co', ...).
tokenizer = RegexpTokenizer(r'\w+')


for tweet in public_tweets:

    #Lowercase the tweet and remove the punctuation using the RegexpTokenizer
    lowers = tweet.text.lower()
    no_punctuation_tweet_text = tokenizer.tokenize(lowers)

    #Remove stop words (currently only for English tweets).
    #Previously this list was built twice (comprehension, then reset and rebuilt in a loop);
    #a single pass yields the same result.
    refined_tweet = [w for w in no_punctuation_tweet_text if w not in stop_words]

    print("Tokenized_tweet:-", no_punctuation_tweet_text)
    print("\n")
    print("Refined_tweet:-",refined_tweet)
    print("\n\n")
    

Tokenized_tweet:- ['what', 'jurgen', 'klopp', 'said', 'about', 'rhian', 'brewster', 'will', 'excite', 'liverpool', 'fans', 'https', 't', 'co', 'wyshgxm9sz', 'via', 'scoopliverpool', 'https', 't', 'co', 'lbyevsjfqv']


Refined_tweet:- ['jurgen', 'klopp', 'said', 'rhian', 'brewster', 'excite', 'liverpool', 'fans', 'https', 'co', 'wyshgxm9sz', 'via', 'scoopliverpool', 'https', 'co', 'lbyevsjfqv']



Tokenized_tweet:- ['india', 'on', 'course', 'to', 'topple', 'china', 'as', 'best', 'attended', 'fifa', 'u', '17', 'world', 'cup', 'https', 't', 'co', 'nvo86ac4em']


Refined_tweet:- ['india', 'course', 'topple', 'china', 'best', 'attended', 'fifa', 'u', '17', 'world', 'cup', 'https', 'co', 'nvo86ac4em']



Tokenized_tweet:- ['new', 'post', 'fifa', 'u', '17', 'world', 'cup', '2017', 'india', 'vs', 'ghana', 'live', 'football', 'match', 'score', 'colts', 'has', 'been', 'published', 'on', 'https', 't', 'co', 'navtznyrjq']


Refined_tweet:- ['new', 'post', 'fifa', 'u', '17', 'world', 'cup', '2017',

In [72]:
#Second Method - Using RAKE (Rapid Automatic Keyword Extraction Algorithm)
from rake_nltk import Rake

import pandas as pd
import numpy as np

# Rake() with no arguments would use NLTK's English stop words and all punctuation characters.
# Here a custom stop-word list and punctuation list are supplied instead.
# NOTE(review): the captured output ranks whole sentences as single phrases, which suggests the
# custom list replaces the default English stop words rather than extending them - confirm intended.
r = Rake(['co','rt','https','t'],[':','//','://','@','.',])

# One feature column per ranked keyphrase score; only the top five are kept per tweet.
columns = ['key1','key2','key3','key4','key5']

print("Total tweets:==",len(public_tweets))
print('\n')

# Collect one row of scores per tweet and build the frame once afterwards.
# Growing a DataFrame row by row with DataFrame.append is quadratic, and that
# method no longer exists in pandas >= 2.0.
score_rows = []

for tweet in public_tweets:

    # Populates r's internal ranking; the call's return value is not needed.
    r.extract_keywords_from_text(tweet.text)
    keyphrases = r.get_ranked_phrases()# To get keyword phrases ranked highest to lowest.
    print("Keyphrases:-->",keyphrases)
    keyphrases_scored = r.get_ranked_phrases_with_scores()# To get keyword phrases ranked highest to lowest with scores.
    print("Keyphrases with scores:-->",keyphrases_scored)
    print("\n")

    # Each entry is (score, phrase); keep the scores of the highest-ranked phrases.
    top_scores = [entry[0] for entry in keyphrases_scored[:len(columns)]]
    # Tweets with fewer phrases than columns are padded with 0.0 so every row
    # has the same width and every column stays numeric.
    top_scores.extend([0.0] * (len(columns) - len(top_scores)))
    score_rows.append(top_scores)

df = pd.DataFrame(score_rows, columns=columns)


print("Dataframe=\n\n",df)

Total tweets:== 15


Keyphrases:--> ['what jurgen klopp said about rhian brewster will excite liverpool fans', '/ wyshgxm9sz via', '/ lbyevsjfqv', 'scoopliverpool']
Keyphrases with scores:--> [(121.0, 'what jurgen klopp said about rhian brewster will excite liverpool fans'), (8.5, '/ wyshgxm9sz via'), (4.5, '/ lbyevsjfqv'), (1.0, 'scoopliverpool')]


Keyphrases:--> ['india on course to topple china as best attended fifa u - 17 world cup', '/ nvo86ac4em']
Keyphrases with scores:--> [(225.0, 'india on course to topple china as best attended fifa u - 17 world cup'), (4.0, '/ nvo86ac4em')]


Keyphrases:--> ['new post ( fifa u - 17 world cup 2017 india vs ghana live football match score', 'colts ...) has been published on -…', '/ navtznyrjq']
Keyphrases with scores:--> [(289.0, 'new post ( fifa u - 17 world cup 2017 india vs ghana live football match score'), (49.0, 'colts ...) has been published on -…'), (4.0, '/ navtznyrjq')]


Keyphrases:--> ["afc general secretary dato ' windsor wants o

In [73]:
#Step-3.Given a bit of text, rank it according to the likelihood of it becoming popular for the given post,
#i.e. gets a high rating in (1) above. The levels of ranks you choose are up to you, at least three is expected.



#Approach - Create a dataframe from the above extracted scores of the tweets keyphrases(Top 5 keyphrases of each tweet)
#Extract keywords/keyphrases with scores from the given text(input)(Read it from a text file)
#Build a predictive model and train it with the fetched tweets keyphrases (5 keyphrases per tweet) with 
#Y = No. of retweets of that tweet, and predict the no. of retweets for a new piece of text
#To rank the new tweet, divide Y into 3 classes
#1-> 0-50 retweets 
#2-> 50-500 retweets
#3-> 500+ retweets


# Retweet count of every fetched tweet, in the same order as the rows of df.
retweets = [tweet.retweet_count for tweet in public_tweets]

# Attach the counts as the target column; .values assigns by position, not by index label.
se = pd.Series(retweets)
df['Retweets'] = se.values
print(df)


     key1  key2  key3  key4 key5  Retweets
0   121.0   8.5   4.5   1.0    0         0
1   225.0   4.0   0.0   0.0    0         0
2   289.0  49.0   4.0   0.0    0         0
3   225.0   8.5   4.5   0.0    0         0
4   961.0   1.0   0.0   0.0    0        78
5   400.0   4.0   0.0   0.0    0         0
6     9.0   4.0   1.0   1.0    1         0
7   192.5  52.5   4.0   0.0    0         0
8   140.0  20.0   4.0   4.0    0         0
9    64.0   0.0   0.0   0.0    0         0
10   81.0  79.5  37.5   0.0    0         0
11  660.0  20.0   0.0   0.0    0         0
12   64.0  64.0  16.0   9.0    1       801
13   64.0  64.0  16.0   9.0    1       801
14  400.0   4.0   4.0   0.0    0         0


In [74]:
# Multiple linear regression: fit retweet counts on the five keyphrase-score columns.
import statsmodels.api as sm # import statsmodels

feature_columns = ["key1","key2","key3","key4","key5"]
X = df[feature_columns]
y = df["Retweets"]

# Both sides are cast to float before fitting.
# NOTE(review): X holds only key1..key5, so no intercept term is fitted - confirm that a
# regression through the origin is intended (see statsmodels' add_constant).
model = sm.OLS(y.astype(float), X.astype(float)).fit()

# In-sample predictions for the same rows the model was trained on.
predictions = model.predict(X)
model.summary()


  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,Retweets,R-squared:,0.902
Model:,OLS,Adj. R-squared:,0.852
Method:,Least Squares,F-statistic:,18.31
Date:,"Thu, 26 Oct 2017",Prob (F-statistic):,9.55e-05
Time:,01:50:19,Log-Likelihood:,-89.111
No. Observations:,15,AIC:,188.2
Df Residuals:,10,BIC:,191.8
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
key1,0.0138,0.088,0.156,0.879,-0.183,0.211
key2,0.1314,1.923,0.068,0.947,-4.154,4.416
key3,0.1699,5.220,0.033,0.975,-11.460,11.800
key4,70.1864,16.315,4.302,0.002,33.835,106.538
key5,81.2044,113.112,0.718,0.489,-170.826,333.235

0,1,2,3
Omnibus:,14.565,Durbin-Watson:,1.8
Prob(Omnibus):,0.001,Jarque-Bera (JB):,11.12
Skew:,-1.652,Prob(JB):,0.00385
Kurtosis:,5.623,Cond. No.,1410.0
