In [1]:
import collections
import glob
import json
import math
import os
import re
from collections import Counter, defaultdict

import demoji
import nltk
import textstat
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [2]:
def compute_entropy(data, unit='natural'):
    """Entropy of the empirical distribution of the items in ``data``.

    data: a sized iterable of hashable items (e.g. a string, counted per character).
    unit: 'shannon' (log base 2), 'natural' (base e) or 'hartley' (base 10).

    Returns 0 when ``data`` holds at most one item; otherwise the sum of
    ``-p * log(p)`` over the relative frequency ``p`` of each distinct item.
    Raises KeyError for an unknown ``unit`` once there is something to measure.
    """
    log_bases = {'shannon': 2., 'natural': math.exp(1), 'hartley': 10.}

    total = len(data)
    if total <= 1:
        return 0

    # Tally how often each distinct item occurs, in first-seen order.
    frequencies = Counter(item for item in data)

    entropy = 0
    for occurrences in frequencies.values():
        probability = float(occurrences) / total
        if probability > 0.:
            entropy -= probability * math.log(probability, log_bases[unit])

    return entropy

In [3]:
# Languages whose NLTK stopword lists are stripped from comments.
_STOPWORD_LANGUAGES = (
    "english", "spanish", "dutch", "french",
    "german", "italian", "portuguese", "romanian",
)
# Holds the merged stopword set once it has been built (empty until first use).
_stopword_cache = []


def _multilingual_stopwords():
    """Return the union of the configured NLTK stopword lists, built only once."""
    if not _stopword_cache:
        merged = set()
        for language in _STOPWORD_LANGUAGES:
            merged.update(stopwords.words(language))
        _stopword_cache.append(frozenset(merged))
    return _stopword_cache[0]


def clean_comment(text):
    """Normalise a raw comment into a space-separated string of alphabetic words.

    Steps: lowercase, drop ``<...>`` tags and ``&...;`` spans, turn newlines into
    spaces, pass the text through ``demoji.replace_with_desc``, split on single
    spaces, then keep only tokens that are purely alphabetic and not stopwords.

    text: the raw comment string.
    Returns the cleaned string; it is empty when no token survives.
    """
    clean_text = text.lower() #lowercase
    clean_text = re.sub(r"<[^>]+>", '', clean_text)
    # NOTE(review): this removes everything from any '&' up to the next ';',
    # not just well-formed HTML entities - confirm that is intended.
    clean_text = re.sub(r"&[^;]+;", '', clean_text)
    clean_text = re.sub(r"\n", ' ', clean_text)
    clean_text = demoji.replace_with_desc(clean_text)

    # The stopword union is identical for every comment, so reuse the cached set
    # instead of re-reading eight corpora on each call.
    stops = _multilingual_stopwords()

    # A token with any non-alphabetic character (attached punctuation, digits)
    # is dropped whole, so the joined result contains only letters and spaces
    # and needs no further character filtering.
    words = clean_text.split(" ")
    filtered_words = [word for word in words if word not in stops and word.isalpha()]
    return ' '.join(filtered_words)

In [4]:
# Substrings that mark an occupation as photography-related; they appear to be
# word stems for "photo"/"picture" in several languages.
_PHOTOGRAPHY_STEMS = (
    "fot", "phot", "valokuv", "zdjęcie", "dealbh", "bild", "grianghraf",
    "nuotrauk", "pictur", "myndin", "billed", "ljósmyndari", "ritratt",
)


def is_relevant_user(occupation):
    """Tell whether an occupation string mentions photography.

    occupation: free-text occupation; may be None or empty.
    Returns True when the lowercased text contains any of the known stems,
    False otherwise (including for a missing or empty occupation).
    """
    # A missing occupation (None / "") cannot match and must not crash on .lower().
    if not occupation:
        return False
    occupation_lower = occupation.lower()
    return any(stem in occupation_lower for stem in _PHOTOGRAPHY_STEMS)

In [5]:
# Load the user feature records and show which fields the first record has.
with open('user_features.json', mode="r", encoding="utf8") as users_file:
    user_features = json.load(users_file)
user_features[0].keys()

dict_keys(['nsid', 'ispro', 'occupation', 'following_n', 'photo_count', 'join_date', 'website', 'profile_description', 'groups', 'groups_n', 'is_photographer', 'following'])

In [6]:
# Recompute the photographer flag of every user from their occupation text.
for profile in user_features:
    occupation_text = profile['occupation']
    profile['is_photographer'] = is_relevant_user(occupation_text)

In [7]:
#save a new file with one additional secondary feature for users
# Write with the same explicit encoding the notebook uses when reading JSON,
# instead of relying on the platform default.
with open('user_features_secondary.json', "w", encoding="utf8") as outfile:
    json.dump(user_features, outfile)

In [8]:
# Read the augmented user records back from disk and show the first record's fields.
with open('user_features_secondary.json', mode="r", encoding="utf8") as secondary_file:
    user_features = json.load(secondary_file)
user_features[0].keys()

dict_keys(['nsid', 'ispro', 'occupation', 'following_n', 'photo_count', 'join_date', 'website', 'profile_description', 'groups', 'groups_n', 'is_photographer', 'following'])

In [9]:
# Load every file in photo_features into one flat list of photo records.
# Each file is expected to contain a JSON list, since it is appended with extend().
# os.path.join keeps the pattern portable (a literal backslash only works on
# Windows), and sorting the matches makes the record order reproducible.
data = []
for filename in sorted(glob.glob(os.path.join('photo_features', '*'))):
    with open(filename, "r", encoding="utf8") as f:
        data.extend(json.load(f))

# Fail with a clear message instead of an IndexError when nothing was loaded.
if not data:
    raise FileNotFoundError("no photo records found under 'photo_features'")
print(data[0].keys())
print(len(data))

dict_keys(['id', 'owner', 'title', 'description', 'views', 'dateuploaded', 'lastupdate', 'tags', 'comments', 'favorites', 'exif', 'groups', 'width_o', 'height_o', 'width_downloaded', 'height_downloaded', 'kong_score', 'nima_score', 'nima_tech_score', 'comments_n', 'favorites_n', 'groups_n'])
2647927


In [None]:
# Clean the text of every comment in place and keep only comments that still
# have text afterwards; count comments before and after the filtering.
n_comments = 0
n_final_comments = 0
for row_by_img in data:
    original_comments = row_by_img["comments"]
    n_comments += len(original_comments)

    filtered_comments = []
    for comment in original_comments:
        # skip comments that were empty to begin with
        if not comment["comment"]:
            continue
        comment["comment"] = clean_comment(comment["comment"])
        # skip comments that cleaning reduced to nothing
        if comment["comment"]:
            filtered_comments.append(comment)

    row_by_img["comments"] = filtered_comments
    n_final_comments += len(filtered_comments)

In [11]:
# Per-photo averages of text features over the photo's non-empty comments.
default_subj_pola = 0.5
default_read = 0
sia = SentimentIntensityAnalyzer()

# One scorer per output field. The dict order fixes both the order in which the
# scorers run on a comment and the order in which the fields are stored.
comment_scorers = {
    "avg_subj": lambda text: TextBlob(text).sentiment.subjectivity,
    "avg_diff_words": lambda text: textstat.difficult_words(text),
    "avg_read_time": lambda text: textstat.reading_time(text, ms_per_char=14.69),
    "avg_entropy": lambda text: compute_entropy(text),
    "avg_length": lambda text: len(text),
    "avg_polarity": lambda text: sia.polarity_scores(text).get("compound"),
}

# Values stored when a photo has no usable comments.
feature_defaults = {field: default_read for field in comment_scorers}
feature_defaults["avg_subj"] = default_subj_pola

for x in data:
    comments = [y["comment"] for y in x["comments"] if y["comment"]]

    # Running totals, accumulated comment by comment.
    totals = dict.fromkeys(comment_scorers, 0)
    for comment in comments:
        for field, scorer in comment_scorers.items():
            totals[field] += scorer(comment)

    for field in comment_scorers:
        if comments:
            x[field] = totals[field] / len(comments)
        else:
            x[field] = feature_defaults[field]

In [12]:
#save new files with secondary features for photos
# One JSON file per photo owner, holding that owner's photo records.
output_dir = 'photo_features_secondary'
# Create the target folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

photos_by_user = defaultdict(list)
for x in data:
    photos_by_user[x["owner"]].append(x)

for user, rows in photos_by_user.items():
    # os.path.join instead of a literal backslash so the path also works off Windows.
    path = os.path.join(output_dir, f'{user}.json')
    with open(path, "w", encoding="utf8") as outfile:
        json.dump(rows, outfile)