In [1]:
import pandas as pd
import re
from textblob import TextBlob
from meme_or_not import set_model_for_image_type,get_image_type
from pytrends.request import TrendReq

pytrends = TrendReq(hl='en-US', tz=530, timeout=(10,25), retries=3, backoff_factor=0.2)


In [2]:
def question_mark_check(row):
    #print(row['text'])
    if '?' not in str(row['text']):
        val = 0
    else:
        val = 1
    return val

def hashtag_check(row):
    #print(row['text'])
    if '#' not in str(row['text']):
        val = 0
    else:
        val = str(row['text']).count('#')
    return val

def url_check(row):

    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    url = re.findall(regex,str(row['text']))
    if len(url)>0:
        return 1
    else:
        return 0
    
def words_count(row):
    return len(str(row['text']).split())

def lexical_diversity(row):
    return len(set(str(row['text']).split())) / len(str(row['text']).split())


def sentiment(row):
    testimonial = TextBlob(str(row['text']))
    return testimonial.sentiment.polarity


def image_check(row):
    if row['image'] ==0:
        return 0
    else:
        return get_image_type(model,row['image'])
        
        
def image_as_boolean(row):
    if row['image'] !=0:
        if row['image_type'] == -1:
            return 0
        else:
            return 1
    else:
        return 0
    

def video_as_boolean(row):
    if row['video'] != 0:
        return 1
    else:
        return 0
    
def trend_status(row):
    
    from rake_nltk import Rake

    r = Rake() # Uses stopwords for english from NLTK, and all puntuation characters.

    r.extract_keywords_from_text(str(row['text']))
 
    words = r.get_ranked_phrases()
    
    proper_nouns = set()
    tb = TextBlob(str(row['text']))
    for i in tb.tags:
        if i[1] == 'NNP':
            proper_nouns.add(str(i[0]))
            
    proper_nouns = sorted(list(proper_nouns),key = len, reverse = True)[:5]
    
            
        
    if len(words) == 0:
        return 0
    kw_list = sorted(list(words),key = len, reverse = True)[:5] + proper_nouns
    #print(kw_list)
    try:
        pytrends.build_payload(kw_list, cat=0, timeframe='today 5-y', geo='IN', gprop='')
        interest = pytrends.interest_over_time().tail(20)
    
        interest = interest[interest.columns[0]]
    except:
        if len(proper_nouns) < 1:
            return 0
        #print(proper_nouns)
        pytrends.build_payload(proper_nouns, cat=0, timeframe='today 5-y', geo='IN', gprop='')
        interest = pytrends.interest_over_time().tail(20)
        
        try:
            interest = interest[interest.columns[0]]
        except:
            return 0
        
    x = interest.mean()
    if x == 0:
        #print("no interest")
        if len(proper_nouns) < 1:
            return 0
        #print(proper_nouns)
        pytrends.build_payload(proper_nouns, cat=0, timeframe='today 5-y', geo='IN', gprop='')
        interest = pytrends.interest_over_time().tail(20)
        
        try:
            interest = interest[interest.columns[0]]
        except:
            print('fail')
            return 0
        return interest.mean()
    return x


def text_boolean(row):
    #print(row['text'])
    if len(str(row['text'])) > 0:
        val = 1
    else:
        val = 0
    return val
    
    

In [3]:
df = pd.read_csv('bigdatastatistics.csv')

In [4]:
df['image'] = df['image'].fillna(0)
df['video'] = df['video'].fillna(0)

In [5]:
df = df[['text', 'image', 'video', 'likes',
       'comments', 'shares']]

In [6]:
number_of_group_members = 26000

In [7]:
df['engagement_score'] = (df['likes'] + df['comments'] + 2*df['shares'])/number_of_group_members

In [8]:
df.drop(df[df.engagement_score == 0].index, inplace=True)

In [9]:
df = df[['text', 'image', 'video','engagement_score' ]]

In [10]:
df['question_mark_check'] = df.apply(question_mark_check, axis=1)
df['hashtag_check'] = df.apply(hashtag_check, axis=1)

In [11]:
df['url_check'] = df.apply(url_check, axis=1)

In [12]:
df['lexical_diversity'] = df.apply(lexical_diversity, axis=1)

In [13]:
df['words_count'] = df.apply(words_count, axis=1)

In [14]:
df['sentiment'] = df.apply(sentiment, axis=1)

In [15]:
model = set_model_for_image_type()



In [16]:
df['image_type'] = df.apply(image_check, axis=1)
df['image'] = df.apply(image_as_boolean, axis=1)
df['video'] = df.apply(video_as_boolean, axis=1)
df['image_type'] = df.apply(image_check, axis=1)

In [17]:
df['trend_score'] = df.apply(trend_status, axis=1)

In [18]:
df['text'] = df.apply(text_boolean, axis=1)

In [19]:
df = df[['text','question_mark_check',
       'hashtag_check', 'url_check', 'lexical_diversity', 'words_count',
       'sentiment', 'image_type', 'trend_score','image','image_type','video','engagement_score']]

In [20]:
df

Unnamed: 0,text,question_mark_check,hashtag_check,url_check,lexical_diversity,words_count,sentiment,image_type,trend_score,image,image_type.1,video,engagement_score
0,1,0,0,0,0.708571,175,0.250668,0,4.65,0,0,0,0.008462
1,1,0,9,1,0.5,88,0.3,-1,0.0,1,-1,0,3.8e-05
7,1,0,0,0,0.939394,33,0.0,-1,1.85,1,-1,0,0.000231
8,1,0,0,0,1.0,14,0.125,0,6.9,0,0,0,0.001346
10,1,0,0,0,1.0,10,0.0,-1,0.0,1,-1,0,3.8e-05
16,1,0,0,0,1.0,9,0.3,-1,20.6,1,-1,0,3.8e-05
21,1,0,0,0,0.708571,175,0.250668,0,4.65,0,0,0,0.008462
22,1,0,0,0,0.966667,30,0.0,0,0.0,0,0,0,0.0005
26,1,0,0,1,1.0,8,0.0,0,10.35,0,0,0,0.000231
29,1,0,0,0,0.768293,164,0.161538,0,0.0,0,0,0,0.000538


In [21]:
df.to_csv('sample_training_set.csv')