# TF/IDF on data stored on Hive Assignment

> CA675

Author : Shubham Rai
Student Number : 21261161

## Cleaning Data

In [None]:
import pandas as pd
df = pd.read_csv('stackexchange_export.csv', quotechar='"')
# Remove HTML Tags
df['Body'] = df['Body'].str.replace(r'<[^<>]*>', '', regex=True) 
# Keep letters
df["Body"] = df['Body'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Remove new line characters
df["Body"] = df['Body'].replace(r'\n',' ', regex=True) 
# Export File to be imported in Hive
df.to_csv('/home/cypherhonk/cleaned_final.csv', sep=',', encoding='utf-8',index=None)
! chmod 755 cleaned_final.csv

## Get connection to Hive

In [81]:
from pyhive import hive
from tabulate import tabulate
import pandas as pd


host_name = "localhost"
port = 10000
user = "cypherhonk"
password = "941416156866692861"
database="default"

def hiveconnection(host_name, port, user,password, database):
    conn = hive.Connection(host=host_name, port=port, username=user, password=password,
                           database=database, auth='CUSTOM')
    return conn


conn = hiveconnection(host_name, port, user,password, database)
cur = conn.cursor()

## Usage example from https://github.com/dropbox/PyHive

# Task 2 & 3

### 2.2.1) Querying top 10 posts by score

In [13]:
cur.execute('select ID, Title, Score, DisplayName from stackexchange_view order by score desc limit 10')
result = cur.fetchall()
print(tabulate(result, tablefmt='orgtbl'))

| 11227809 | Why is processing a sorted array faster than processing an unsorted array? | 25933 | GManNickG       |
|   927358 | How do I undo the most recent local commits in Git?                        | 23348 | Hamza Yerlikaya |
|  2003505 | How do I delete a Git branch locally and remotely?                         | 18514 | Matthew Rankin  |
|   292357 | What is the difference between 'git pull' and 'git fetch'?                 | 12834 | pupeno          |
|   231767 | What does the "yield" keyword do?                                          | 11551 | Alex. S.        |
|   477816 | What is the correct JSON content type?                                     | 10921 | Oli             |
|   348170 | How do I undo 'git add' before commit?                                     | 10079 | paxos1977       |
|  5767325 | How can I remove a specific item from an array?                            |  9931 | Walker          |
|  6591213 | How do I rename a local Git branch?                        

### 2.2.2) The top 10 users by post score 

In [34]:
### Join data as usernames data was extracted and added post data pulling

cur.execute("""
    select
           OwnerUserId,
           DisplayName,
           sum(Score) as score
    from stackexchange_view
    group by OwnerUserId,DisplayName
    order by score desc
    LIMIT 10

""")
result = cur.fetchall()
print(tabulate(result, tablefmt='orgtbl'))

|  87234 | GManNickG       | 37672 |
|   4883 | readonly        | 28817 |
|   9951 | e-satis         | 26878 |
|   6068 | pupeno          | 25944 |
|  89904 | Hamza Yerlikaya | 24024 |
|  51816 | Joan Venge      | 23763 |
|  49153 | Ali             | 20203 |
| 179736 | TIMEX           | 19603 |
|  95592 | Matthew Rankin  | 19479 |
|  63051 | flybywire       | 19362 |


In [20]:
top_10_user_score = []
for each_item in result:
    top_10_user_score.append(each_item[0])
top_10_user_score

[87234, 4883, 9951, 6068, 89904, 51816, 49153, 179736, 95592, 63051]


### 2.2.3) The number of distinct users, who used the word “cloud” in one of their posts


In [36]:
cur.execute("""
SELECT
     COUNT(DISTINCT owneruserid) as user_count
FROM stackexchange_view
WHERE title LIKE '% cloud %' or Body LIKE '% cloud %'
""")
result = cur.fetchall()
print(tabulate(result, tablefmt='orgtbl'))

| 248 |


## TF/IDF

In [74]:
### Get data for top 10 users from above list and pull data

df = pd.read_sql(f"""
SELECT
       
       DISTINCT owneruserid,
       displayname,
       title,
       body
from stackexchange_view
WHERE owneruserid IN {tuple(top_10_user_score)}
order by owneruserid""", conn)



In [83]:
# Attach body and title fields to create one field to be 
df["text"] = df["title"] + df["body"]
top_10_username = list(df["displayname"].unique())
top_10_username

['readonly',
 'pupeno',
 'e-satis',
 'Ali',
 'Joan Venge',
 'flybywire',
 'GManNickG',
 'Hamza Yerlikaya',
 'Matthew Rankin',
 'TIMEX']

In [76]:
## References taken and custom function created. 
## Reference : https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a

from sklearn.feature_extraction.text import TfidfVectorizer


# Calculate sum() of TF-IDF and get top 10 words with highest TF-IDF and select only those columns
def calculate_tf_idf(df):
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True) # Remove Stop Words
    response = vectorizer.fit_transform(df["text"]) # Use title field for TF/IDF
    df_tfidf_sklearn = pd.DataFrame(response.toarray(),columns=vectorizer.get_feature_names())
    total_tf_idf = df_tfidf_sklearn.sum(axis = 0) # Remove sum of TF/IDF for getting top 10 most used words
    top_10_list = total_tf_idf.nlargest(10) # Get top 10 words per user
    top_10_words = list(top_10_list.index) # Get list of top 10 words
    df_tfidf_sklearn[top_10_words] # Select only top 10 words as column
    return df_tfidf_sklearn[top_10_words]


## For each of top 10 users and their top 10 words plot TF/IDF table for their correspnding comments

In [80]:
## For all 10 users create TF-IDF table for each user for their respective comments. 

for each_user in top_10_username: ## Iterate over all top 10 users
    filtered_data = df[(df['displayname']==each_user)] # Filter data only for selected user
    tf_idf_df = calculate_tf_idf(filtered_data) # pass on to above function
    print("For Username ID TF/IDF table : "+each_user) 
    tf_idf_df.insert(0, 'usernameid', each_user)# attach username ID field to dataframe
    display(tf_idf_df)
    print("******************************************************************************\n\n\n")
    

    


For Username ID TF/IDF table : readonly


Unnamed: 0,usernameid,python,use,list,table,branch,way,difference,git,process,rename
0,readonly,0.0,0.0,0.0,0.0,0.636117,0.0,0.0,0.202994,0.0,0.0
1,readonly,0.108905,0.239666,0.428552,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,readonly,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,readonly,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.350231,0.0
4,readonly,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203342,0.0,0.0
5,readonly,0.156136,0.0,0.0,0.0,0.0,0.0,0.220471,0.0,0.0,0.0
6,readonly,0.0,0.0,0.0,0.0,0.0,0.179925,0.0,0.0,0.0,0.0
7,readonly,0.0,0.0,0.0,0.521156,0.0,0.0,0.0,0.0,0.0,0.568902
8,readonly,0.127064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,readonly,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


******************************************************************************



For Username ID TF/IDF table : pupeno


Unnamed: 0,usernameid,file,java,android,sql,git,way,like,application,data,dump
0,pupeno,0.0,0.0,0.0,0.0,0.0,0.084849,0.0,0.0,0.0,0.0
1,pupeno,0.0,0.0,0.0,0.0,0.0,0.0,0.021477,0.029489,0.0,0.0
2,pupeno,0.0,0.0,0.0,0.0,0.0,0.085361,0.265579,0.0,0.0,0.0
3,pupeno,0.108337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.123813,0.0
4,pupeno,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,pupeno,0.187244,0.0,0.0,0.277057,0.0,0.109712,0.0,0.0,0.093622,0.0
6,pupeno,0.116987,0.0,0.0,0.692403,0.0,0.0,0.094784,0.0,0.116987,0.323682
7,pupeno,0.0,0.0,0.0,0.0,0.0,0.0,0.089201,0.0,0.0,0.0
8,pupeno,0.0,0.0,0.13015,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,pupeno,0.0,0.0,0.0,0.105631,0.0,0.0,0.0723,0.0,0.267708,0.617252


******************************************************************************



For Username ID TF/IDF table : e-satis


Unnamed: 0,usernameid,python,git,head,like,does,using,use,gt,way,know
0,e-satis,0.077636,0.0,0.0,0.131321,0.155273,0.0,0.0,0.0,0.0,0.069198
1,e-satis,0.0,0.0,0.0,0.0,0.0,0.236308,0.0,0.0,0.0,0.0
2,e-satis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064273,0.0
3,e-satis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,e-satis,0.0,0.0,0.0,0.101237,0.119702,0.0,0.0,0.0,0.0,0.0
5,e-satis,0.0,0.0,0.0,0.0,0.22774,0.0,0.11387,0.0,0.0,0.0
6,e-satis,0.0,0.070489,0.0,0.059616,0.0,0.0,0.0,0.0,0.0,0.062828
7,e-satis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,e-satis,0.0,0.0,0.0,0.034854,0.0,0.164845,0.123634,0.0,0.116494,0.146928
9,e-satis,0.0,0.075851,0.0,0.0,0.0,0.151701,0.0,0.0,0.0,0.135212


******************************************************************************



For Username ID TF/IDF table : Ali


Unnamed: 0,usernameid,gt,array,javascript,using,php,lt,id,jquery,key,file
0,Ali,0.225908,0.000000,0.000000,0.168649,0.000000,0.243885,0.000000,0.258347,0.000000,0.0
1,Ali,0.089387,0.210973,0.245229,0.000000,0.000000,0.000000,0.289498,0.102222,0.339392,0.0
2,Ali,0.000000,0.000000,0.349359,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,Ali,0.000000,0.000000,0.000000,0.174570,0.223285,0.000000,0.000000,0.000000,0.000000,0.0
4,Ali,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
74,Ali,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
75,Ali,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
76,Ali,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
77,Ali,0.022841,0.000000,0.000000,0.000000,0.000000,0.024659,0.000000,0.000000,0.000000,0.0


******************************************************************************



For Username ID TF/IDF table : Joan Venge


Unnamed: 0,usernameid,python,string,like,list,gt,class,want,value,function,index
0,Joan Venge,0.193545,0.000000,0.000000,0.132596,0.000000,0.000000,0.000000,0.0,0.0,0.00000
1,Joan Venge,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00000
2,Joan Venge,0.000000,0.000000,0.051889,0.000000,0.000000,0.000000,0.071552,0.0,0.0,0.28987
3,Joan Venge,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00000
4,Joan Venge,0.000000,0.000000,0.000000,0.000000,0.000000,0.358144,0.000000,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...
61,Joan Venge,0.000000,0.000000,0.153097,0.000000,0.000000,0.128018,0.000000,0.0,0.0,0.00000
62,Joan Venge,0.000000,0.000000,0.000000,0.169950,0.000000,0.000000,0.153303,0.0,0.0,0.20702
63,Joan Venge,0.344646,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00000
64,Joan Venge,0.000000,0.000000,0.000000,0.100046,0.328351,0.000000,0.000000,0.0,0.0,0.00000


******************************************************************************



For Username ID TF/IDF table : flybywire


Unnamed: 0,usernameid,file,python,want,vs,use,java,gt,standard,instance,output
0,flybywire,0.0,0.124444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,flybywire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,flybywire,0.09927,0.111844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,flybywire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,flybywire,0.0,0.119751,0.0,0.0,0.0,0.0,0.239501,0.0,0.0,0.0
5,flybywire,0.0,0.0,0.0,0.228055,0.0,0.0,0.0,0.0,0.0,0.0
6,flybywire,0.0,0.0,0.116217,0.0,0.0,0.0,0.280514,0.0,0.0,0.0
7,flybywire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,flybywire,0.0,0.0,0.123087,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,flybywire,0.0,0.0,0.112601,0.0,0.0,0.0,0.0,0.0,0.0,0.0


******************************************************************************



For Username ID TF/IDF table : GManNickG


Unnamed: 0,usernameid,lt,copy,data,int,idiom,gt,quot,swap,arraysize,array
0,GManNickG,0.0,0.523691,0.0,0.0,0.418953,0.0,0.0,0.314215,0.0,0.079656
1,GManNickG,0.126037,0.0,0.0,0.252073,0.0,0.252073,0.252073,0.0,0.0,0.0
2,GManNickG,0.431818,0.0,0.51101,0.172727,0.0,0.129546,0.086364,0.0,0.283895,0.194318


******************************************************************************



For Username ID TF/IDF table : Hamza Yerlikaya


Unnamed: 0,usernameid,file,timer,java,new,table,bits,byte,iterate,application,files
0,Hamza Yerlikaya,0.47002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.268619,0.10447
1,Hamza Yerlikaya,0.066477,0.0,0.053078,0.37992,0.0,0.0,0.0,0.0,0.0,0.0
2,Hamza Yerlikaya,0.356757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.211455
3,Hamza Yerlikaya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084239
4,Hamza Yerlikaya,0.0,0.0,0.156744,0.0,0.0,0.527915,0.527915,0.527915,0.0,0.0
5,Hamza Yerlikaya,0.0,0.0,0.026395,0.0,0.533399,0.0,0.0,0.0,0.0,0.0
6,Hamza Yerlikaya,0.0,0.744404,0.073674,0.210937,0.0,0.0,0.0,0.0,0.0,0.0
7,Hamza Yerlikaya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25488,0.099127
8,Hamza Yerlikaya,0.0,0.0,0.284629,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Hamza Yerlikaya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


******************************************************************************



For Username ID TF/IDF table : Matthew Rankin


Unnamed: 0,usernameid,install,pip,python,lt,gt,branch,flask,version,debugger,started
0,Matthew Rankin,0.0,0.0,0.0,0.0,0.0,0.0,0.555725,0.555725,0.0,0.0
1,Matthew Rankin,0.0,0.0,0.358589,0.0,0.0,0.0,0.0,0.0,0.495842,0.495842
2,Matthew Rankin,0.0,0.0,0.0,0.0,0.043561,0.571753,0.0,0.0,0.0,0.0
3,Matthew Rankin,0.427251,0.284834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Matthew Rankin,0.0,0.0,0.0,0.650278,0.544984,0.0,0.0,0.0,0.0,0.0
5,Matthew Rankin,0.235094,0.211585,0.094038,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Matthew Rankin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Matthew Rankin,0.349045,0.319958,0.319958,0.0,0.0,0.0,0.0,0.0,0.0,0.0


******************************************************************************



For Username ID TF/IDF table : TIMEX


Unnamed: 0,usernameid,python,want,user,string,gt,dictionary,lt,use,javascript,return
0,TIMEX,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000
1,TIMEX,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000
2,TIMEX,0.000000,0.130509,0.0,0.000000,0.179279,0.0,0.187309,0.000000,0.00000,0.000000
3,TIMEX,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000
4,TIMEX,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.17604,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
110,TIMEX,0.113904,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000
111,TIMEX,0.000000,0.031423,0.0,0.090197,0.000000,0.0,0.000000,0.000000,0.00000,0.048591
112,TIMEX,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000
113,TIMEX,0.000000,0.000000,0.0,0.000000,0.423683,0.0,0.000000,0.076049,0.00000,0.000000


******************************************************************************



