In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('bigdatastatistics.csv')

In [3]:
df['image'] = df['image'].fillna(0)
df['video'] = df['video'].fillna(0)

In [4]:
df = df[['text', 'image', 'video', 'likes',
       'comments', 'shares']]

In [5]:
number_of_group_members = 26000

In [6]:
df['engagement_score'] = (df['likes'] + df['comments'] + 2*df['shares'])/number_of_group_members

In [7]:
df.drop(df[df.engagement_score == 0].index, inplace=True)

In [8]:
df = df[['text', 'image', 'video','engagement_score' ]]

In [9]:
def question_mark_check(row):
    #print(row['text'])
    if '?' not in str(row['text']):
        val = 0
    else:
        val = 1
    return val

def hashtag_check(row):
    #print(row['text'])
    if '#' not in str(row['text']):
        val = 0
    else:
        val = 1
    return val

In [10]:
df['question_mark_check'] = df.apply(question_mark_check, axis=1)
df['hashtag_check'] = df.apply(hashtag_check, axis=1)

In [11]:

import re

def url_check(row):

    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    url = re.findall(regex,str(row['text']))
    if len(url)>0:
        return 1
    else:
        return 0
	


In [12]:
df['url_check'] = df.apply(url_check, axis=1)

In [13]:
def lexical_diversity(row):
    return len(set(str(row['text']).split())) / len(str(row['text']).split())

In [14]:
df['lexical_diversity'] = df.apply(lexical_diversity, axis=1)

In [15]:
def words_count(row):
    return len(str(row['text']).split())

In [16]:
df['words_count'] = df.apply(words_count, axis=1)

In [17]:
from textblob import TextBlob
def sentiment(row):
    testimonial = TextBlob(str(row['text']))
    return testimonial.sentiment.subjectivity

In [18]:
df['sentiment'] = df.apply(sentiment, axis=1)

In [19]:
from meme_or_not import set_model_for_image_type,get_image_type

In [20]:
model = set_model_for_image_type()



In [21]:
def image_check(row):
    if row['image'] ==0:
        return 0
    else:
        return get_image_type(model,row['image'])
        
        
def image_as_boolean(row):
    if row['image'] !=0:
        if row['image_type'] == -1:
            return 0
        else:
            return 1
    else:
        return 0
    

def video_as_boolean(row):
    if row['video'] != 0:
        return 1
    else:
        return 0
    

In [22]:
df['image_type'] = df.apply(image_check, axis=1)
df['image'] = df.apply(image_as_boolean, axis=1)
df['video'] = df.apply(video_as_boolean, axis=1)
df['image_type'] = df.apply(image_check, axis=1)

In [46]:
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=530, timeout=(10,25), retries=2, backoff_factor=0.1)


def trend_status(row):
    blob = TextBlob(str(row['text']))
 
    words = set()
    for nouns in blob.noun_phrases:
        words.add(nouns)
        
    if len(words) == 0:
        return 0
    kw_list = sorted(list(words),key = len, reverse = True)[:5]
    print(kw_list)
    try:
        pytrends.build_payload(kw_list, cat=0, timeframe='today 5-y', geo='IN', gprop='')
        interest = pytrends.interest_over_time().tail(20)
    
        interest = interest[interest.columns[0]]
    except:
        print(kw_list)
        return 0
    return interest.mean()
    
    

In [47]:
df['trend_score'] = df.apply(trend_status, axis=1)

['free educational resource', 'data scientist ground', 'different backgrounds', 'data science journey', 'genuine job postings']
['storagesolution', 'cloudcomputing', 'tyrone systems', 'dataanalytics', 'cost savings']
['lockdown offer to be an expert in data data science in', 'happie software consultancy', 'finroots @ gmail.com', 'class starts from', 'yesterday']
['job assistance', 'certification', 'anybody', 'digital', 'data']
['techvidvan.com impact', 'techvidvan', 'automobile', 'data']
['databricks certification', 'udemy.com apache spark', 'scala']
['free educational resource', 'data scientist ground', 'different backgrounds', 'data science journey', 'genuine job postings']
['renforcement des capacités du cabinet', 'mbaye ndour', 'formations', 'whatsapp', 'contact']
['data analysis', 'python pdf']
['proper data validation policies', 'analytics alexander sakalosh', 'ml2quantum.com dataiku dss', 'interactive dashboards', 'production deployment']
['certification course', '% job placemen

In [45]:
df.drop(['trend_score'],axis = 1,inplace = True)

In [44]:
df

Unnamed: 0,text,image,video,engagement_score,question_mark_check,hashtag_check,url_check,lexical_diversity,words_count,sentiment,image_type,trend_score
0,"Welcome to the Big Data, Data Science, Data Mi...",0,0,0.008462,0,0,0,0.708571,175,0.414973,0,0.0
1,Cloud computing offers businesses more flexibi...,1,0,3.8e-05,0,1,1,0.5,88,0.25,-1,0.0
7,Happie Software Consultancy and Services\nYest...,1,0,0.000231,0,0,0,0.939394,33,0.0,-1,0.0
8,"Anybody Interested in Data Science, Digital Ma...",0,0,0.001346,0,0,0,1.0,14,0.25,0,0.0
10,TECHVIDVAN.COM\nImpact of Big Data in Automobi...,1,0,3.8e-05,0,0,0,1.0,10,0.1,-1,0.0
16,UDEMY.COM\nApache Spark with Scala useful for ...,1,0,3.8e-05,0,0,0,1.0,9,0.0,-1,0.6
21,"Welcome to the Big Data, Data Science, Data Mi...",0,0,0.008462,0,0,0,0.708571,175,0.414973,0,0.0
22,Mbaye Ndour\n16 hrs ·\n\nFormations et renforc...,0,0,0.0005,0,0,0,0.966667,30,0.0,0,0.0
26,Data Analysis From with Python\nPDF link: http...,0,0,0.000231,0,0,1,1.0,8,0.0,0,81.05
29,ML2QUANTUM.COM\nDataiku DSS - Data Science Stu...,0,0,0.000538,0,0,0,0.768293,164,0.476282,0,0.0
