## Team Members

1. Siva Sai Kumari Maineni
2. Mahathi Mandapati
3. Yang Wang
4. Kang (Kevin) Hou
5. Kinnary Uday Panchal
6. Aboorva Erode Baskaran

**Imports**

In [None]:
#!/usr/bin/python3.6
#Install LDA library if not already installed
# pip3.6 install --user lda
# the topic-modeling input is img_lbs_scr_with_engagement.csv, re-saved as img_lbs_scr_with_engagement_mod.csv with an explicit header (id as column 1, the image labels in the 'labels' column)
# there are two output files: topic_word_dist.xlsx and document_topic_dist.xlsx
# the script prompts for the name of the columns -- in the data file, I named them id and labels. You are then prompted for # topics. 4 or 5 may be a good starting point.



import os, csv, nltk, json, lda
import warnings
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt

from os import listdir
from os.path import isfile, join
from collections import Counter
# Imports the Google Cloud client library
from google.cloud import vision
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer,word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_predict, cross_val_score



# Fetch the NLTK resources used further down (POS tagger, stop words,
# tokenizer models and WordNet data).
for nltk_resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

warnings.simplefilter(action='ignore', category=FutureWarning)

# Respect credentials that are already configured in the environment; only fall
# back to the original local key file when the variable is not set, so the
# notebook does not clobber a working configuration on another machine.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/kinnarypanchal/Downloads/uda-assignment-4-96341864fcec.json",
)

**Scraping**

##### **We used the online scraper PhantomBuster to scrape the NatGeo Instagram page**

In [None]:
# Load the scraped posts and keep only the ones of type 'Photo'.
df = pd.read_json('insta_post.json')
# .copy() gives an independent frame, so adding a column below does not write
# into a view of df (which triggers SettingWithCopyWarning).
df_photo = df[df['type'] == 'Photo'].copy()
# df_photo = df_photo.drop_duplicates(subset='postUrl', keep="first")
new_img_list = df_photo['postUrl']

# This step enables us to access the image after the Instagram URL expiration date
imgUrl_list = [s + 'media/?size=l' for s in new_img_list]
# The list is in the same row order as df_photo, so it is assigned positionally.
df_photo['imgUrl'] = imgUrl_list

df_photo.to_json('insta_post1.json', orient= 'records')

**Image Labeling**

In [None]:
# Load the photo posts written by the scraping step; json.load returns the
# parsed content (the file was saved with orient='records').
# The context manager closes the file handle as soon as parsing is done.
with open('insta_post1.json') as f:
    data = json.load(f)

In [None]:
def detect_labels_uri(uri):
    """Detect labels in an image located in Google Cloud Storage or on the Web.

    Args:
        uri: URI of the image to annotate.

    Returns:
        A tuple ``(lbs, scr)`` of two parallel lists: the label descriptions
        and the score reported for each label.

    Raises:
        Exception: if the Vision API response carries an error message.
    """
    from google.cloud import vision
    client = vision.ImageAnnotatorClient()
    image = vision.Image()
    image.source.image_uri = uri

    response = client.label_detection(image=image)

    # Check for an API-level error BEFORE returning. In the original code this
    # check sat after the return statement and could never run, so failed
    # requests silently produced empty label lists.
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    labels = response.label_annotations
    lbs = [label.description for label in labels]
    scr = [label.score for label in labels]

    return (lbs, scr)
        
# Collect every post id and report how many distinct ids there are.
pid = [post['postId'] for post in data]

items = Counter(pid).keys()
print("No of unique items in the list are:", len(items))

# Same check for the image URLs.
url = [post['imgUrl'] for post in data]

items = Counter(url).keys()
print("No of unique items in the list are:", len(items))

In [None]:
# Image URL of every post, in the same order as `data`.
url = [post['imgUrl'] for post in data]

# Annotate every post with the labels and scores detected for its image.
for post, img_url in zip(data, url):
    try:
        lbs, scr = detect_labels_uri(img_url)
    except Exception as err:
        # One unreachable image must not abort the whole run and discard the
        # labels already collected: report it and store empty results.
        print("Label detection failed for", img_url, "-", err)
        lbs, scr = [], []
    post['labels'] = lbs
    post['score'] = scr

with open('insta_label_score.json', 'w') as json_file:
    json.dump(data, json_file)

### There are 675 different posts with 893 images in total.
### However, only 485 of those images are unique across the 675 posts.
### This implies that multiple posts share the same images.

**Data Preparation**

In [None]:
# Read the labelled posts back from disk; the context manager closes the file
# handle once the JSON has been parsed.
with open('insta_label_score.json') as f1:
    dt = json.load(f1)

In [None]:
# Pull the fields needed for modelling out of the labelled posts.
# Iterate over `dt` itself: the original looped over range(len(data)) while
# indexing `dt`, which only works when the in-memory `data` from the labelling
# cell happens to exist and to have the same length.
lbs = [post['labels'] for post in dt]
scr = [post['score'] for post in dt]
likeCount = [post['likeCount'] for post in dt]
commentCount = [post['commentCount'] for post in dt]
caption = [post['caption'] for post in dt]

In [None]:
# Take the image URLs from the loaded records rather than from the `url` list
# left over from the labelling cell, so this cell only depends on `dt`.
img_urls = [post['imgUrl'] for post in dt]
df = pd.DataFrame(
    list(zip(img_urls, lbs, scr, likeCount, commentCount, caption)),
    columns=['imgUrl', 'labels', 'score', 'likeCount', 'commentCount', 'caption'],
)
df.head()

In [None]:
# Persist the label/score table; the DataFrame index is written as the first (unnamed) column.
df.to_csv('img_lbs_scr.csv')

**Creating the Engagement Column (Target)**

In [None]:
# Scale likes and comments to [0, 1] by their respective maxima, then combine
# them into a weighted engagement score (40% likes, 60% comments).
df['normalizedLikeCount'] = df['likeCount']/(df['likeCount'].max())
df['normalizedCommentCount'] = df['commentCount']/(df['commentCount'].max())
df['engagementScore'] =  .4*df['normalizedLikeCount'] + .6*df['normalizedCommentCount']

# Binary target: 1 when the score is strictly above the median, else 0.
# The median is computed once and compared vectorised; the original recomputed
# it for every row inside DataFrame.apply.
median_engagement = df['engagementScore'].median()
df['engagement'] = (df['engagementScore'] > median_engagement).astype(int)
df.head()

In [None]:
# Persist the table with the engagement target. The index is written as the first
# (unnamed) column; later cells either rename that column to 'id' or drop it as 'Unnamed: 0'.
df.to_csv('img_lbs_scr_with_engagement.csv')

## Task C - Topic Modeling

In [None]:
def tokenize_text(version_desc):
    """Lower-case, tokenize and lemmatize one document.

    Relies on the module-level ``word_tokenizer`` and ``wordnet_lemmatizer``
    objects, which must be created before this function is called.

    Args:
        version_desc: the raw text of one document.

    Returns:
        A list of lemmatized, lower-cased tokens.
    """
    tokens = word_tokenizer.tokenize(version_desc.lower())
    # Lemmatize token by token. The original passed the whole document string
    # to the lemmatizer, which treats its argument as a single word and so
    # left multi-word text unchanged.
    return [wordnet_lemmatizer.lemmatize(token) for token in tokens]

In [None]:
# Load the engagement table. It was saved together with its index, so the first
# (unnamed) column holds the row number; re-saving with headerList gives that
# column the name 'id'.
image_df = pd.read_csv("img_lbs_scr_with_engagement.csv")
headerList = ['id', 'imgUrl', 'labels', 'score', 'likeCount', 'commentCount', 'caption', 'normalizedLikeCount', 'normalizedCommentCount', 'engagementScore', 'engagement']
image_df.to_csv("img_lbs_scr_with_engagement_mod.csv", header=headerList, index=False)
image_df = pd.read_csv("img_lbs_scr_with_engagement_mod.csv")


#checking for nulls if present any
print("Number of rows with any of the empty columns:")
# Count rows (not individual cells) so the number matches the message above.
print(image_df.isnull().any(axis=1).sum())
reviews_df = image_df.dropna()


id_column = input('provide the column name for id: ')
labels_column = input('provide the column name for text: ')
# Convert once; every later use needs an integer.
ntopics = int(input('Provide the number of latent topics: '))


word_tokenizer=RegexpTokenizer(r'\w+')
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_nltk=set(stopwords.words('english'))


# stop_words is passed as a list: recent scikit-learn versions validate the
# parameter type and do not accept a set.
vec_words = CountVectorizer(tokenizer=tokenize_text,stop_words=list(stopwords_nltk),decode_error='ignore')
total_features_words = vec_words.fit_transform(reviews_df[labels_column])

print(total_features_words.shape)

model = lda.LDA(n_topics=ntopics, n_iter=500, random_state=1)
lda_train = model.fit(total_features_words)

topic_word = model.topic_word_
# The model was fitted on reviews_df (rows without nulls), so the per-document
# topic weights must carry reviews_df's index; a default 0..n-1 index would
# attach weights to the wrong rows whenever dropna removed something.
doc_topic = pd.DataFrame(model.doc_topic_, index=reviews_df.index)
image_df = image_df.join(doc_topic)

# Group from a frame that actually contains the topic columns. The original
# grouped reviews_df, which never received them, and failed with a KeyError.
reviews_topics_df = reviews_df.join(doc_topic)
doc_topic_df = pd.DataFrame()

for i in range(ntopics):
    topic="topic_"+str(i)
    doc_topic_df[topic]=reviews_topics_df.groupby([id_column])[i].mean()

topic_words_df = doc_topic_df.reset_index()
topics = pd.DataFrame(topic_word)
# get_feature_names() was removed from newer scikit-learn releases; fall back
# to it only when get_feature_names_out() is not available.
if hasattr(vec_words, "get_feature_names_out"):
    topics.columns = vec_words.get_feature_names_out()
else:
    topics.columns = vec_words.get_feature_names()
topics1 = topics.transpose()


# Announce each file only after it has been written.
topics1.to_excel("topic_word_dist.xlsx")
print ("Topics word distribution written in file topic_word_dist.xlsx ")
doc_topic_df.to_excel("document_topic_dist.xlsx",index=False)
print ("Document topic distribution written in file document_topic_dist.xlsx ")

In [None]:
# Preview the first two rows of the per-id topic weights.
doc_topic_df.head(2)

In [None]:
# Put the topic weights next to the post data (both re-indexed 0..n-1 so the
# rows are paired positionally).
df_nat_final = pd.concat([reviews_df.reset_index(drop=True), doc_topic_df.reset_index(drop=True)], axis=1)

# Quartile boundaries of the engagement score.
q1, q2, q3 = np.percentile(df_nat_final.engagementScore, [25, 50, 75])

# Use whatever topic columns the LDA cell produced instead of hard-coding
# topic_0..topic_9, which only works when exactly 10 topics were requested.
topic_columns = list(doc_topic_df.columns)

top_quartile = df_nat_final[df_nat_final['engagementScore'] > q3]
average_topic_weights_top = top_quartile[topic_columns].mean(axis=0)

bottom_quartile = df_nat_final[df_nat_final['engagementScore'] < q1]
average_topic_weights_bot = bottom_quartile[topic_columns].mean(axis=0)

# Average topic weight per quartile, one row per topic.
quartile_topics = pd.concat([average_topic_weights_top,average_topic_weights_bot],axis=1)
quartile_topics.columns = ['Top Quartile','Bottom Quartile']
quartile_topics

**Modeling**

**Reading the Labeled data**

In [None]:
# Reload the engagement table and drop the index column written by to_csv.
df = pd.read_csv('img_lbs_scr_with_engagement.csv') \
       .drop(['Unnamed: 0'], axis=1)

# Recover the post URL by cutting at the LAST 'media' (the suffix added while
# scraping); splitting at the first occurrence would truncate any URL that
# happens to contain 'media' earlier on.
df['imgUrl'] = df['imgUrl'].apply(lambda x: x.rsplit('media', 1)[0])
print(df.columns)
print(df.shape)


# Read the scraped posts; the context manager closes the file.
with open('insta_post1.json') as f:
    data = json.load(f)

# Build the lookup frame in one go. Appending row by row is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
df2 = pd.DataFrame(
    [{'img_url': descr['postUrl'], 'description': descr['description']} for descr in data],
    columns=['img_url', 'description'],
)

print(df.shape, df2.shape)

# Map every post URL to its description; a URL missing from the lookup raises
# KeyError, exactly as before.
descr_lookup = df2.set_index('img_url').to_dict()['description']
df['description'] = [descr_lookup[img_url] for img_url in df['imgUrl']]

**Processing the data**

In [None]:
# English Snowball stemmer; NOTE(review): no active (uncommented) code in this notebook appears to use it.
stemmer = SnowballStemmer("english")
# NLTK's English stop-word list, used below to filter the tokenized descriptions.
stop = stopwords.words('english')

def remove_punctuation(s):
    """
    Remove punctuation characters and digits from a sentence.

    Args:
        s: the input string.

    Returns:
        ``s`` without any of the characters listed in ``string_punctuation``
        (note that '!' and whitespace are kept).
    """
    # Raw string: the backslash is meant literally. Written as '\,' in a normal
    # string it is an invalid escape sequence and triggers a warning.
    string_punctuation = r'''()-[]{};:'"\,<>./?@#$%^&*_~1234567890'''
    removable = set(string_punctuation)
    # join() builds the result in one pass instead of repeated concatenation.
    return "".join(letter for letter in s if letter not in removable)

# Define function to lemmatize each word with its POS tag

# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag gives None.
    """
    wordnet_attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = wordnet_attr_by_initial.get(nltk_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)
    
    
def lemmetization(pos_tagged_words):
    """Lemmatize (word, tag) pairs and join the results into one string.

    A word whose tag is None is kept unchanged; every other word is passed to
    the module-level ``lemmatizer`` together with its tag. The resulting words
    are joined with single spaces.
    """
    return " ".join(
        token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag)
        for token, wn_tag in pos_tagged_words
    )

    
    
    
# removing unnecessary characters: lower-case, keep only the text after the
# last '|', and strip newlines and curly quotes
df['description'] = df['description'].apply(lambda x: x.lower().split('|')[-1] \
                                                       .replace('\n', '') \
                                                       .replace('“', '') \
                                                       .replace('”', ''))
# removing punctuations in a sentence. Address the column by name: the original
# wrote through df.iloc[i, -1], which silently hits the wrong column as soon as
# 'description' is not the last one.
df['description'] = df['description'].apply(remove_punctuation)

# Tokenize and drop stop words (set membership is O(1) per token).
stop_lookup = set(stop)
df['description'] = df['description'].apply(word_tokenize)
df['description'] = df['description'].apply(lambda x : [w for w in x if w not in stop_lookup])

# df['description'] = df['description'].apply(lambda x: [stemmer.stem(y) for y in x])
# df['description'] = df['description'].apply(lambda x : " ".join(x))

# POS-tag every token, translate the tags to WordNet POS constants and
# lemmatize; lemmetization() returns the words re-joined into one string.
lemmatizer = WordNetLemmatizer()
df['description'] = df['description'].apply(lambda x: nltk.pos_tag(x))
df['description'] = df['description'].apply(lambda words: [(word, pos_tagger(tag)) for word, tag in words])
df['description'] = df['description'].apply(lambda pos_tagged_words: lemmetization(pos_tagged_words))


# TF-IDF features of the cleaned descriptions.
Text_Column = list(df['description'].copy())
sklearn_tfidf = TfidfVectorizer(min_df=.01, max_df =.95, stop_words="english",use_idf=True, smooth_idf=False, sublinear_tf=True)
sklearn_representation = sklearn_tfidf.fit_transform(Text_Column)
# get_feature_names() was removed from newer scikit-learn releases; fall back
# to it only when get_feature_names_out() is not available.
if hasattr(sklearn_tfidf, "get_feature_names_out"):
    tfidf_feature_names = sklearn_tfidf.get_feature_names_out()
else:
    tfidf_feature_names = sklearn_tfidf.get_feature_names()
Tfidf_Output = pd.DataFrame(sklearn_representation.toarray(), columns=tfidf_feature_names)

final_data = pd.concat([Tfidf_Output, df['engagement']], axis=1)

In [None]:
#Plotting Confusion Matrix
def show_confusion_matrix(C,class_labels=('0','1')):
    """Draw a 2x2 confusion matrix with per-class error rates and accuracy.

    Args:
        C: 2x2 array laid out as [[tn, fp], [fn, tp]] (rows are true labels,
            columns are predicted labels).
        class_labels: the two class names used as tick labels.

    Raises:
        AssertionError: if ``C`` is not of shape (2, 2).

    The figure is created with pyplot and left as the current figure; nothing
    is returned.
    """
    assert C.shape == (2,2), "Confusion matrix should be from binary classification only."

    # true negative, false positive, etc...
    tn = C[0,0]; fp = C[0,1]; fn = C[1,0]; tp = C[1,1]

    NP = fn+tp # Num positive examples
    NN = tn+fp # Num negative examples
    N  = NP+NN

    fig = plt.figure(figsize=(8,8))
    ax  = fig.add_subplot(111)
    ax.imshow(C, interpolation='nearest', cmap=plt.cm.gray)

    # Draw the grid boxes (a 3x3 grid: the extra row/column holds the metrics)
    ax.set_xlim(-0.5,2.5)
    ax.set_ylim(2.5,-0.5)
    for boundary in (0.5, 1.5):
        ax.plot([-0.5,2.5],[boundary,boundary], '-k', lw=2)
    for boundary in (0.5, 1.5):
        ax.plot([boundary,boundary],[-0.5,2.5], '-k', lw=2)

    # Accept any sequence of labels; the third tick (metrics column/row) is blank.
    tick_labels = list(class_labels) + ['']

    # Set xlabels
    ax.set_xlabel('Predicted Label', fontsize=16)
    ax.set_xticks([0,1,2])
    ax.set_xticklabels(tick_labels)
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    # These coordinate might require some tinkering. Ditto for y, below.
    ax.xaxis.set_label_coords(0.34,1.06)

    # Set ylabels. Ticks are fixed BEFORE the labels (as for the x axis) so the
    # labels are attached to the intended tick positions.
    ax.set_ylabel('True Label', fontsize=16, rotation=90)
    ax.set_yticks([0,1,2])
    ax.set_yticklabels(tick_labels,rotation=90)
    ax.yaxis.set_label_coords(-0.09,0.65)

    # (x, y, text) for every grid cell: the four counts, then the per-class
    # error rates, the accuracy, and two blank filler boxes.
    cell_texts = [
        (0, 0, '%d'%(tn)),
        (0, 1, '%d'%fn),
        (1, 0, '%d'%fp),
        (1, 1, '%d'%(tp)),
        (2, 0, 'Error: %.2f'%(fp / (fp+tn+0.))),
        (2, 1, 'Error: %.2f'%(fn / (tp+fn+0.))),
        (2, 2, 'Accuracy: %.2f'%((tp+tn+0.)/N)),
        (0, 2, ' '),
        (1, 2, ' '),
    ]
    for x_pos, y_pos, text in cell_texts:
        ax.text(x_pos, y_pos, text,
                va='center',
                ha='center',
                bbox=dict(fc='w',boxstyle='round,pad=1'))

    plt.tight_layout()

**With Description**

In [None]:
# Features: every TF-IDF column; target: the binary engagement flag.
selected_features = [col for col in final_data.columns if col not in ['engagement']]
X = final_data[selected_features]
Y = final_data['engagement']

#Logit Regression
classifier = LogisticRegression()
# cross_val_predict is imported directly; the `model_selection` module name
# used originally is never imported and raised a NameError.
Y_pred = cross_val_predict(classifier, X, Y, cv=5)

# calculating CV accuracy
cv_scores = cross_val_score(classifier, X, Y, cv=5)
print("CV  accuracy", sum(cv_scores) / len(cv_scores))

# calculating CV roc_auc
cv_scores = cross_val_score(classifier, X, Y, cv=5, scoring='roc_auc')
print("CV auc score", sum(cv_scores) / len(cv_scores))

# calculating confusion matrix
cm = confusion_matrix(Y, Y_pred)
show_confusion_matrix(cm, ['0', '1'])
# Save before showing: once the figure has been shown, savefig would write an
# empty figure.
plt.savefig('Confusion.png')
plt.show()

**With Image Labels**

In [None]:
# Turn the stringified label list into plain text: drop commas and single
# quotes, then strip the first and last character (the surrounding brackets).
# The result is kept in its own variable instead of overwriting df['labels'],
# so re-running this cell does not strip characters a second time.
labels_text = (
    df['labels']
    .str.replace(',', '', regex=False)
    .str.replace("'", '', regex=False)
    .str[1:-1]
)
Text_Column = labels_text
sklearn_tfidf = TfidfVectorizer(min_df=.01, max_df =.95, stop_words="english",use_idf=True, smooth_idf=False, sublinear_tf=True)
sklearn_representation = sklearn_tfidf.fit_transform(Text_Column)
# get_feature_names() was removed from newer scikit-learn releases; fall back
# to it only when get_feature_names_out() is not available.
if hasattr(sklearn_tfidf, "get_feature_names_out"):
    label_feature_names = sklearn_tfidf.get_feature_names_out()
else:
    label_feature_names = sklearn_tfidf.get_feature_names()
Tfidf_Output = pd.DataFrame(sklearn_representation.toarray(), columns=label_feature_names)


# Keep only the target and the TF-IDF features.
Input = pd.concat([df, Tfidf_Output], axis=1)
Input=Input.drop(['imgUrl', 'labels', 'score', 'likeCount', 'commentCount', 'caption',
       'normalizedLikeCount', 'normalizedCommentCount', 'engagementScore','description'], axis=1)

X = Input.loc[:, Input.columns != 'engagement']
Y = Input['engagement']
classifier = LogisticRegression()
Y_pred = cross_val_predict(classifier, X, Y, cv=5)
# Store the result under its own name: rebinding `confusion_matrix` would hide
# the sklearn function and break every later call to it.
labels_cm = confusion_matrix((np.array(Y)), Y_pred)

show_confusion_matrix(labels_cm, ['0', '1'])
# Save before showing: once the figure has been shown, savefig would write an
# empty figure.
# NOTE(review): the file name says "Description_model" although this cell fits
# the image-label model - confirm the intended name.
plt.savefig('Confusion_matrix_for_Description_model.png')
plt.show()

print(roc_auc_score(Y, Y_pred))

**Using Both Labels and Description**

In [None]:
# NOTE(review): data_for_caption_logit.csv is not written anywhere in the
# visible notebook - presumably an export of the description TF-IDF features;
# confirm where it comes from.
df3=pd.read_csv('data_for_caption_logit.csv')
df3=df3.drop(['Unnamed: 0','engagement'], axis=1)

# Image-label features plus the features read above, side by side.
Input2= pd.concat([Input, df3], axis=1)
X2 = Input2.loc[:, Input2.columns != 'engagement']
Y2 = Input2['engagement']
classifier = LogisticRegression()
# cross_val_predict is imported directly; the `model_selection` module name
# used originally is never imported and raised a NameError.
Y_pred2 = cross_val_predict(classifier, X2, Y2, cv=5)
# Store the result under its own name so the sklearn `confusion_matrix`
# function is not replaced by an array.
combined_cm = confusion_matrix((np.array(Y2)), Y_pred2)
print(combined_cm)

show_confusion_matrix(combined_cm, ['0', '1'])
# Save before showing: once the figure has been shown, savefig would write an
# empty figure.
plt.savefig('Confusion_for_model_with_descr_imglabels.png')
plt.show()