In [1]:
import sys
import os
import json
import string
import pprint
import csv
import re
from datetime import datetime
import math
import numpy as np

import networkx as nx

# Ticker symbols read from the 'Symbol' column of SP500.csv.
company_list = []
id_list = []
with open('SP500.csv') as SP500csv:
    reader = csv.DictReader(SP500csv)
    for row in reader:
        company_list.append(row['Symbol'])

# Sentiment keywords read from the 'Buy' and 'Sell' columns of keyword.csv.
# The two columns may have different lengths, so blank cells are skipped in
# BOTH columns: an empty string is a substring of every tweet and would mark
# every stock tweet as carrying sentiment.
buy_keywords = []
sell_keywords = []
with open('keyword.csv') as keyword:
    reader = csv.DictReader(keyword)
    for row in reader:
        if row['Buy']:
            buy_keywords.append(row['Buy'])
        if row['Sell']:
            sell_keywords.append(row['Sell'])

def generateDiGraph(node_ids, edge_data):
    """Collapse interaction records into a weighted directed graph.

    node_ids  -- iterable of node identifiers registered in the intermediate
                 multigraph.
    edge_data -- iterable of dicts with 'source', 'dest' and 'post_id' keys;
                 'post_id' is used as the multigraph edge key, so a repeated
                 (source, dest, post_id) triple is stored only once.

    Returns an nx.DiGraph in which each edge's 'weight' is the number of
    parallel edges between that ordered pair in the multigraph. Only nodes
    that take part in at least one edge end up in the returned graph.
    """
    multigraph = nx.MultiDiGraph()
    for node in node_ids:
        multigraph.add_node(node)
    for record in edge_data:
        multigraph.add_edge(record['source'], record['dest'], key=record['post_id'])

    collapsed = nx.DiGraph()
    for src, dst in multigraph.edges():
        # Re-adding an existing pair just rewrites the same weight.
        collapsed.add_edge(src, dst, weight=multigraph.number_of_edges(src, dst))
    return collapsed

def calculate_authority(G):
    """Return the authority scores (second element of nx.hits) for graph G."""
    _, authority_by_node = nx.hits(G)
    return authority_by_node

# Column order of the per-user attribute table written by
# generateUserAttributes; positions are looked up by name.
attribute_list = [
    'n_tweet',
    'n_stock_tweet',
    'n_sentiment_tweet',
    'statuses_count',
    'followers_count',
    'friends_count',
    'authority_score',
    'reputation_score',
]
# Currently disabled columns:
#   'avg_quote', 'avg_stock_quote', 'avg_sentiment_quote',
#   'avg_reply', 'avg_stock_reply', 'avg_sentiment_reply',
#   'avg_retweet', 'avg_stock_retweet', 'avg_sentiment_retweet',
#   'avg_favorite', 'avg_stock_favorite', 'avg_sentiment_favorite'
#attributes = len(attribute_list)

def _mentions_sentiment_keyword(tweet_lower):
    """Return True if the lower-cased tweet text contains any buy or sell keyword."""
    return (any(keyword in tweet_lower for keyword in buy_keywords)
            or any(keyword in tweet_lower for keyword in sell_keywords))


def generateUserAttributes(year, month, source_dir, attribute_list):
    """Build the per-user attribute table for one month and save it as CSV.

    Reads Extra_Storage/<source_dir>/<year>/<month>/id_list.txt (one user id
    per line), walks every *.json file below that directory (one JSON document
    per line; undecodable lines are skipped) and, for each tweet that has both
    'text' and 'user', updates the author's row:

      * n_tweet, n_stock_tweet, n_sentiment_tweet -- counters;
      * statuses_count, followers_count, friends_count -- largest value seen;
      * authority_score -- HITS authority on the quote/retweet graph;
      * reputation_score -- PageRank on the same graph.

    The table is written to user_attributes_<year>_<month>.csv in the same
    directory. np.savetxt's default '# ' header prefix is kept on purpose:
    preprocessing() reads the first column as '# n_tweet'.

    Parameters:
        year, month    -- directory name components (strings).
        source_dir     -- dataset directory under Extra_Storage.
        attribute_list -- column names; must contain the eight names above.

    Raises:
        ValueError -- if a required column is missing from attribute_list or a
                      tweet's author is not listed in id_list.txt.
    """
    path = os.path.join("Extra_Storage", source_dir, year, month)

    with open(os.path.join(path, "id_list.txt")) as id_file:
        idList = [line.strip() for line in id_file]

    # id -> row index; setdefault keeps the FIRST occurrence, matching what
    # list.index() returned, but with constant-time lookups.
    row_of_user = {}
    for row_index, user_id in enumerate(idList):
        row_of_user.setdefault(user_id, row_index)

    # Resolve column positions once instead of on every tweet.
    n_tweet_col = attribute_list.index('n_tweet')
    n_stock_col = attribute_list.index('n_stock_tweet')
    n_sentiment_col = attribute_list.index('n_sentiment_tweet')
    authority_col = attribute_list.index('authority_score')
    reputation_col = attribute_list.index('reputation_score')
    profile_cols = [(name, attribute_list.index(name))
                    for name in ('statuses_count', 'followers_count', 'friends_count')]

    user_attributes = [[0] * len(attribute_list) for _ in idList]

    # A ticker only counts when written as ' $SYMBOL ' (space on both sides).
    company_tags = [(' $' + company + ' ').lower() for company in company_list]

    node_ids = []        # graph nodes in first-seen order
    seen_nodes = set()   # constant-time membership for node_ids
    edge_data = []

    for dirPath, dirNames, fileNames in os.walk(path):
        for file in fileNames:
            if not file.endswith(".json"):
                continue
            print(file)
            with open(os.path.join(dirPath, file)) as json_data:
                for line in json_data:
                    try:
                        tweet = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    # Skip records that are not usable tweets. A record without
                    # 'user' must not abort the rest of the file.
                    if not isinstance(tweet, dict) or 'text' not in tweet or 'user' not in tweet:
                        continue

                    user = tweet['user']
                    idIndex = row_of_user.get(str(user['id']))
                    if idIndex is None:
                        raise ValueError("%r is not in id_list.txt" % str(user['id']))
                    stats = user_attributes[idIndex]

                    # Profile counters: keep the largest value seen this month.
                    for key, column in profile_cols:
                        if user[key] > stats[column]:
                            stats[column] = user[key]

                    # NOTE: per-tweet engagement averages (avg_quote, avg_reply,
                    # avg_retweet, avg_favorite and their stock/sentiment
                    # variants) are disabled; their columns are not part of
                    # attribute_list.

                    # One edge author -> original author per quote / retweet.
                    for reference in ('quoted_status', 'retweeted_status'):
                        if reference not in tweet:
                            continue
                        target = tweet[reference]['user']['id']
                        for node in (user['id'], target):
                            if node not in seen_nodes:
                                seen_nodes.add(node)
                                node_ids.append(node)
                        edge_data.append({'source': user['id'], 'dest': target, 'post_id': tweet['id']})

                    stats[n_tweet_col] += 1
                    tweet_lower = tweet['text'].lower()
                    # A tweet is counted at most once, however many tickers it names.
                    if any(tag in tweet_lower for tag in company_tags):
                        stats[n_stock_col] += 1
                        if _mentions_sentiment_keyword(tweet_lower):
                            stats[n_sentiment_col] += 1

    G = generateDiGraph(node_ids, edge_data)

    # Graph nodes that are not in id_list.txt (e.g. quoted users) are ignored.
    hubs, authority_scores = nx.hits(G, max_iter=2000, tol=1.0e-6)
    for node, value in authority_scores.items():
        idIndex = row_of_user.get(str(node))
        if idIndex is not None:
            user_attributes[idIndex][authority_col] = value

    reputation_scores = nx.pagerank(G)
    for node, value in reputation_scores.items():
        idIndex = row_of_user.get(str(node))
        if idIndex is not None:
            user_attributes[idIndex][reputation_col] = value

    arr = np.array(user_attributes)
    savepath = os.path.join(path, 'user_attributes_' + year + '_' + month + '.csv')
    np.savetxt(savepath, arr, header=','.join(attribute_list), delimiter=',')

#generateUserAttributes('2016', '03', 'Archiveteam_Filtered', attribute_list)

In [2]:
#Preprocess and generate weighting for sentiments by users
from sklearn import linear_model

def preprocessing(year, month, source_dir):
    """Derive per-user trust scores from the monthly attribute table.

    Reads Extra_Storage/<source_dir>/<year>/<month>/user_attributes_<year>_<month>.csv
    (its first header cell is '# n_tweet' because of np.savetxt's comment
    prefix) and produces one row per user with four columns:

      0. expertise  -- n_stock_tweet / n_tweet, or 0 when n_tweet is 0;
      1. authority  -- authority_score as read;
      2. reputation -- reputation_score as read;
      3. experience -- 1 - |mean expertise - expertise|.

    The result is written to trust_scores_<year>_<month>.csv in the same
    directory (header without comment prefix) and returned as a float array of
    shape (n_users, 4). A file with no data rows yields an empty (0, 4) array
    and a header-only output file.
    """
    path = os.path.join("Extra_Storage", source_dir, year, month)

    preprocessedATT = []
    with open(os.path.join(path, "user_attributes_" + year + "_" + month + ".csv")) as tweet_attributes:
        for row in csv.DictReader(tweet_attributes):
            n_tweet = float(row['# n_tweet'])
            expertise = float(row['n_stock_tweet']) / n_tweet if n_tweet != 0 else 0
            preprocessedATT.append([
                expertise,
                float(row['authority_score']),
                float(row['reputation_score']),
            ])

    # The mean is undefined for an empty table, so only compute it (and the
    # experience column that depends on it) when there is at least one user.
    if preprocessedATT:
        avg_expertise = np.array(preprocessedATT).mean(axis=0)[0]
        for item in preprocessedATT:
            item.append(1 - abs(avg_expertise - item[0]))  # experience

    # reshape keeps the two-dimensional (n, 4) contract even when n == 0, so
    # callers can always slice columns.
    arr = np.array(preprocessedATT, dtype=float).reshape(-1, 4)
    savepath = os.path.join(path, 'trust_scores_' + year + '_' + month + '.csv')
    filter_names = 'expertise_score,authority_score,reputation_score,experience_score'
    np.savetxt(savepath, arr, header=filter_names, delimiter=',', comments='')
    return arr

#ATT = preprocessing('2016', '03')

In [4]:
#generateUserAttributes('2016', '05', 'ArchiveteamTest', attribute_list)

#generateUserAttributes('2016', '04', 'ArchiveteamTest', attribute_list)
#ATT = preprocessing('2016', '04', 'ArchiveteamTest')
# Rebuild the attribute table and trust scores for each month, newest first;
# ATT ends up holding the scores of the last month processed (2015-11).
for _year, _month in (('2016', '03'), ('2016', '02'), ('2016', '01'),
                      ('2015', '12'), ('2015', '11')):
    generateUserAttributes(_year, _month, 'ArchiveteamTest', attribute_list)
    ATT = preprocessing(_year, _month, 'ArchiveteamTest')

01.json
03.json
08.json
10.json
11.json
12.json
13.json
14.json
16.json
17.json
18.json
19.json
20.json
21.json
22.json
24.json
25.json
26.json
27.json
28.json
29.json
30.json
31.json
01.json
02.json
03.json
04.json
05.json
06.json
07.json
08.json
09.json
14.json
15.json
16.json
17.json
18.json
19.json
20.json
21.json
22.json
23.json
24.json
25.json
26.json
01.json
02.json
03.json
04.json
05.json
06.json
07.json
08.json
09.json
10.json
11.json
12.json
13.json
14.json
15.json
16.json
17.json
18.json
19.json
20.json
21.json
22.json
23.json
24.json
25.json
26.json
27.json
28.json
29.json
30.json
31.json
01.json
02.json
03.json
04.json
05.json
15.json
16.json
17.json
18.json
19.json
20.json
21.json
22.json
23.json
24.json
25.json
26.json
27.json
28.json
29.json
31.json
02.json
03.json
04.json
05.json
06.json
07.json
08.json
09.json
10.json
11.json
12.json
13.json
14.json
15.json
17.json
18.json
19.json
20.json
21.json
22.json
30.json


In [9]:
def combineFilters(source_dir, source_year_start, source_month_start, source_year_end, source_month_end, target_year, target_month):
    """Fit a linear model of ROI on the four trust scores and score a target month.

    For every month from source_year_start/source_month_start through
    source_year_end/source_month_end (inclusive) the first column of
    Acc_ROI_all_<year>_<month>.csv supplies the regression targets and
    preprocessing() supplies the four feature columns. The fitted coefficients
    are printed, and the predictions for target_year/target_month are written
    to predicted_scores_<target>_from_<start>_to_<end>.csv in the target
    month's directory. All year/month arguments are strings.
    """
    start_year, end_year = int(source_year_start), int(source_year_end)
    y = []
    X = [[] for _ in range(4)]  # one growing list per feature column

    for year_int in range(start_year, end_year + 1):
        # Only the boundary years are clipped; years in between span Jan-Dec.
        first_month = int(source_month_start) if year_int == start_year else 1
        last_month = int(source_month_end) if year_int == end_year else 12
        cur_year = str(year_int)

        for month_int in range(first_month, last_month + 1):
            # Directory names use two-digit months.
            cur_month = '0' + str(month_int) if month_int < 10 else str(month_int)

            path = os.path.join("Extra_Storage", source_dir, cur_year, cur_month)
            roi_file = os.path.join(path, "Acc_ROI_all_" + cur_year + "_" + cur_month + ".csv")
            with open(roi_file) as ROIcsv:
                y.extend([float(row[0]) for row in csv.reader(ROIcsv)])

            ATT = preprocessing(cur_year, cur_month, source_dir)
            for column_index, column in enumerate(X):
                column.extend(list(ATT[:, column_index]))

    regr = linear_model.LinearRegression()
    regr.fit(np.array(X).transpose(), y)
    print(regr.coef_)

    ATT = preprocessing(target_year, target_month, source_dir)
    X_new = [list(ATT[:, column_index]) for column_index in range(4)]
    Y_new = regr.predict(np.array(X_new).transpose())

    out_name = ('predicted_scores_' + target_year + '_' + target_month
                + '_from_' + source_year_start + '_' + source_month_start
                + '_to_' + source_year_end + '_' + source_month_end + '.csv')
    savepath = os.path.join("Extra_Storage", source_dir, target_year, target_month, out_name)
    np.savetxt(savepath, Y_new, delimiter=',', comments='', header='combined_score')

# Earlier five-argument call kept for reference; the current signature also
# requires target_year and target_month.
#combineFilters("Archiveteam_Halfyear", "2016", "03", "2016", "05")
# Fit on 2015-11 through 2016-04 and write the predicted scores for 2016-05.
combineFilters("Archiveteam_Halfyear", "2015", "11", "2016", "04", "2016", "05")

[  6.45543152e-01  -1.00942015e-01   3.05676053e+02  -2.48605762e-01]


In [27]:
import sys
import os
import json
import string
import pprint
import csv
import re
from datetime import datetime
import math

# Ticker symbols read from the 'Symbol' column of SP500.csv.
company_list = []
id_list = []
with open('SP500.csv') as SP500csv:
    reader = csv.DictReader(SP500csv)
    for row in reader:
        company_list.append(row['Symbol'])

# Blank cells are skipped in both keyword columns: an empty keyword is a
# substring of every tweet and would flag every stock tweet as sentiment.
buy_keywords = []
sell_keywords = []
with open('keyword.csv') as keyword:
    reader = csv.DictReader(keyword)
    for row in reader:
        if row['Buy']:
            buy_keywords.append(row['Buy'])
        if row['Sell']:
            sell_keywords.append(row['Sell'])

# Per-user counters: [all tweets, stock tweets, sentiment tweets].
attributes = 3
with open(os.path.join("Extra_Storage", "Archiveteam_Filtered", "2016", "04", "id_list.txt" )) as id_list:
    idList = [x.strip() for x in id_list]
n_authors = len(idList)
user_attributes = [[0] * attributes for _ in range(n_authors)]

# id -> row index (first occurrence wins, as list.index() did) so each tweet
# is attributed in constant time instead of scanning idList.
row_of_user = {}
for row_index, user_id in enumerate(idList):
    row_of_user.setdefault(user_id, row_index)

# A ticker only counts when written as ' $SYMBOL ' (space on both sides).
company_tags = [(' $' + company + ' ').lower() for company in company_list]

path = os.path.join("Extra_Storage", "Archiveteam_Filtered", "2016", "04")

for dirPath, dirNames, fileNames in os.walk(path):
    for file in fileNames:
        if not file.endswith(".json"):
            continue
        filepath = os.path.join(dirPath, file)
        with open(filepath) as json_data:
            for line in json_data:
                try:
                    tweet = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Skip unusable records; one record without 'user' must not
                # discard the remaining tweets of the file.
                if not isinstance(tweet, dict) or 'text' not in tweet or 'user' not in tweet:
                    continue
                idIndex = row_of_user.get(str(tweet['user']['id']))
                if idIndex is None:
                    raise ValueError("%r is not in id_list.txt" % str(tweet['user']['id']))

                user_attributes[idIndex][0] += 1     # all tweets
                tweet_lower = tweet['text'].lower()
                # A tweet is counted at most once, however many tickers it names.
                if any(tag in tweet_lower for tag in company_tags):
                    user_attributes[idIndex][1] += 1     # stock-related tweets
                    sentiment = (any(k in tweet_lower for k in buy_keywords)
                                 or any(k in tweet_lower for k in sell_keywords))
                    if sentiment:
                        user_attributes[idIndex][2] += 1    # sentiment-related tweets

In [44]:
import numpy as np
# Persist the per-user counters with three decimals and no header row.
# NOTE(review): the counting cell reads id_list.txt and tweets from the
# "2016"/"04" directory, while this path points at "2016"/"03" -- confirm the
# month is intended before re-running.
savepath = os.path.join("Extra_Storage", "Archiveteam_Filtered", "2016", "03", 'user_attributes_2016_03.csv')
arr = np.asarray(user_attributes)
np.savetxt(savepath, arr, delimiter=',', fmt='%1.3f')

In [21]:
# Select the users with at least 20 stock-related tweets (column 1 of the
# attribute table) and write their ids, one per line.
top_author_list = []
savepath = os.path.join("Extra_Storage", "Archiveteam_Filtered", "2016", "03" )

with open(os.path.join(savepath, "id_list.txt")) as id_list:
    idList = [x.strip() for x in id_list]

with open(os.path.join(savepath, 'user_attributes_2016_03.csv')) as tweetCount:
    reader = csv.reader(tweetCount)
    i = 0  # index of the current DATA row; row i belongs to idList[i]
    for row in reader:
        # Blank lines and np.savetxt's '# ...' header are not user rows and
        # must not advance the user index.
        if not row or row[0].lstrip().startswith('#'):
            continue
        # Counts are stored as decimal text (e.g. '20.000'), which int()
        # rejects, so compare as float.
        if float(row[1]) >= 20:
            top_author_list.append(idList[i])
        i += 1

# The context manager closes the file even if a write fails.
with open(os.path.join(savepath, "Morethan20id_list.txt"), 'w') as file:
    for item in top_author_list:
        file.write("%s\n" % item)
    

In [12]:
import os
import csv
import json

# Export the text of every tweet written by a "more than 20" author that
# mentions an S&P 500 ticker as ' $SYMBOL ' (case-sensitive in this cell).
with open(os.path.join("Extra_Storage", "Archiveteam_Filtered", "2016", "04", "Morethan20id_list.txt" )) as id_list:
    top_author_list = [x.strip() for x in id_list]
# Set copy for constant-time membership tests inside the tweet loop.
top_author_set = set(top_author_list)

path = os.path.join("Extra_Storage", "Archiveteam_Filtered", "2016", "04")
company_list = []
with open('SP500.csv') as SP500csv:
    reader = csv.DictReader(SP500csv)
    for row in reader:
        company_list.append(row['Symbol'])
company_tags = [' $' + company + ' ' for company in company_list]

# The output is opened as UTF-8 text so tweets are written as-is. Formatting
# text.encode("utf-8") with "%s" writes the bytes repr (b'...') in Python 3,
# not the tweet. The with-block also closes the file if an error occurs.
# A tweet containing line breaks occupies several lines in the output.
with open(os.path.join(path, "Morethan20_text.txt"), 'w', encoding="utf-8") as textfile:
    for dirPath, dirNames, fileNames in os.walk(path):
        for file in fileNames:
            if not file.endswith("json"):
                continue
            filepath = os.path.join(dirPath, file)
            with open(filepath) as json_data:
                for line in json_data:
                    try:
                        tweet = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    # Skip records that are not tweets with an author.
                    if not isinstance(tweet, dict) or 'text' not in tweet or 'user' not in tweet:
                        continue
                    if str(tweet['user']['id']) not in top_author_set:
                        continue
                    # Write each tweet once, however many tickers it names.
                    if any(tag in tweet['text'] for tag in company_tags):
                        textfile.write("%s\n" % tweet['text'])