# Mapping Questions into a user Profile

In [1]:
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Read the filtered tag and question tables, using each file's 'Id' column as the index.
tags = pd.read_csv(
    "../input/Tags_Filtered.csv",
    index_col='Id',
    encoding='latin1',
)
questions = pd.read_csv(
    "../input/Questions_Filtered.csv",
    index_col='Id',
    encoding="latin1",
)

In [138]:
# Peek at three randomly chosen rows of the tag table.
tags.sample(n=3)

Unnamed: 0_level_0,Tag
Id,Unnamed: 1_level_1
12184050,wpf
10025730,javascript
24535430,c++


In [139]:
# Peek at three randomly chosen rows of the question table.
questions.sample(n=3)

Unnamed: 0_level_0,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
37998770,5444726.0,2016-06-23T18:03:32Z,,0,found hosting page virtualhost ubuntu,trying host webpage apache ubuntu platform usi...
26391770,1910724.0,2014-10-15T20:43:23Z,,3,ontouchevent surfaceview called seconds delayed,simple game surfaceview sometimes game respond...
21265380,3211802.0,2014-01-21T17:42:53Z,,-2,hadoop grep command,one tell following grep command bin hadoop jar...


## Evaluating a user based on their Questions

In this section, we use the questions and tags datasets to create two user-by-tag matrices. The two matrices are:
* A matrix containing total Scoring statistics
* A matrix keeping track of total Questions asked by user per category

In [33]:
# Row labels are the distinct askers; column labels are the distinct tags.
asker_ids = questions['OwnerUserId'].unique()
tag_labels = tags['Tag'].unique()
matrix_shape = (len(asker_ids), len(tag_labels))

# Two zero-filled user-by-tag matrices with identical labels:
# one accumulates scores, the other accumulates question counts.
user_scores = pd.DataFrame(np.zeros(matrix_shape), index=asker_ids, columns=tag_labels)
user_questions = pd.DataFrame(np.zeros(matrix_shape), index=asker_ids, columns=tag_labels)

user_scores.index.name = 'Id'
user_questions.index.name = 'Id'

user_scores.head(2)

Unnamed: 0_level_0,flex,actionscript-3,svn,sql,asp.net,algorithm,colors,c#,.net,c++,...,meteor,laravel,firebase,parse.com,typescript,docker,apache-spark,reactjs,spring-boot,ionic-framework
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
# Show every question asked by user 80 together with its tags.
# Both frames are read with index_col='Id', so 'Id' is already the index and
# set_index('Id') would raise KeyError; a plain index join pairs each question
# with its tags (one output row per question/tag pair).
questions.loc[questions['OwnerUserId'] == 80].join(tags)

Unnamed: 0_level_0,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Tag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
250970,80.0,2008-10-30T17:09:55Z,,3,detect error object support property methodâ,object iâ working instantiated javascript used...,vbscript
250970,80.0,2008-10-30T17:09:55Z,,3,detect error object support property methodâ,object iâ working instantiated javascript used...,error-handling
1046810,80.0,2009-06-26T00:18:39Z,,31,using jslint notepad,seen text editors use extensions allow syntax ...,javascript


Number of users we will begin dealing with

In [35]:
# Distinct user counts: rows of each matrix versus distinct askers in the question table.
score_user_count = len(user_scores.index.unique())
question_user_count = len(user_questions.index.unique())
asker_count = len(questions['OwnerUserId'].unique())
print(score_user_count, question_user_count, asker_count)

592057 592057 592057


To make sure all tag ids trace back to at least one question

In [38]:
# Number of tag rows whose Id also appears as a question Id, next to the total
# number of tag rows; the two numbers match when every tag row traces back to a question.
# 'Id' is the index of both frames (index_col='Id'), so compare the indexes
# rather than a non-existent 'Id' column.
print(len(tags[tags.index.isin(questions.index)]), len(tags))

2414518 2414518


Number of questions we will be dealing with

In [39]:
# Row count of the question table.
questions.shape[0]

1173725

The number of questions that have a score of exactly 0. For this exercise, we will treat them as having a score of one, because dropping them would discard almost half of our data (546,681 of 1,173,725 questions).

In [40]:
# Count the questions whose score is exactly zero.
int((questions['Score'] == 0).sum())

546681

## Populating the matrices

This loop iterates through our dataset and adds 
1. 1 to each tag, *per tag per question* to the asker's row
2. The maximum of (1,Score) *per tag per question* to the asker's row. The max serves to treat 0 scores as 1

If score is less than 0, do nothing

In [None]:
# Populate the two user-by-tag matrices in one vectorised pass.
#
# This replaces a nested iterrows() loop that scanned the entire tag table once
# per question, looked up a non-existent 'Id' column (Id is the index of both
# frames), and double-counted if the cell was run twice. The result is assigned
# rather than accumulated, so re-running the cell is idempotent.

# Questions with a negative score contribute nothing.
non_negative = questions.loc[questions['Score'] >= 0, ['OwnerUserId', 'Score']]

# One row per (question, tag) pair. merge(on=...) accepts index level names,
# and 'Id' is the index name of both frames.
question_tags = non_negative.merge(tags[['Tag']], on='Id')

# A score of 0 is counted as 1, i.e. max(Score, 1) for every kept question.
question_tags['Weight'] = question_tags['Score'].clip(lower=1)

# Aggregate per (asker, tag). groupby drops rows whose OwnerUserId or Tag is missing.
per_user_tag = question_tags.groupby(['OwnerUserId', 'Tag'])
question_counts = per_user_tag.size().unstack(fill_value=0)
score_totals = per_user_tag['Weight'].sum().unstack(fill_value=0)

# Re-align to the row/column labels of the pre-built matrices so users or tags
# without any non-negative question keep an all-zero row/column.
user_questions = question_counts.reindex(
    index=user_questions.index, columns=user_questions.columns, fill_value=0
).astype(float)
user_scores = score_totals.reindex(
    index=user_scores.index, columns=user_scores.columns, fill_value=0
).astype(float)
user_questions.index.name = 'Id'
user_scores.index.name = 'Id'

In [47]:
# Show the first three users' score totals for a hand-picked subset of tags.
score_preview_tags = [
    '.net', 'actionscript-3', 'angularjs', 'asp.net', 'c#', 'cookies', 'css',
    'date', 'flash', 'flex', 'generics', 'html', 'javascript', 'session',
    'sqlite', 'tsql', 'vb.net', 'web-services', 'xml',
]
user_scores.head(3)[score_preview_tags]

Unnamed: 0_level_0,.net,actionscript-3,angularjs,asp.net,c#,cookies,css,date,flash,flex,generics,html,javascript,session,sqlite,tsql,vb.net,web-services,xml
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
26.0,6.0,26.0,2.0,4.0,18.0,6.0,10.0,1.0,1.0,30.0,2.0,10.0,4.0,6.0,1.0,1.0,1.0,4.0,4.0
58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83.0,89.0,0.0,0.0,65.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,44.0,0.0,0.0


In [48]:
# Show the first three users' question counts for a hand-picked subset of tags.
count_preview_tags = [
    '.net', 'actionscript-3', 'angularjs', 'asp.net', 'c#', 'cookies', 'css',
    'date', 'flash', 'flex', 'generics', 'html', 'javascript', 'session',
    'sqlite', 'tsql', 'vb.net', 'web-services', 'xml',
]
user_questions.head(3)[count_preview_tags]

Unnamed: 0_level_0,.net,actionscript-3,angularjs,asp.net,c#,cookies,css,date,flash,flex,generics,html,javascript,session,sqlite,tsql,vb.net,web-services,xml
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
26.0,1.0,1.0,1.0,4.0,3.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83.0,9.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0


In [50]:
# Persist both matrices with the user id written as the first column, labelled 'Id'.
# `index` is a boolean flag; the column header belongs in `index_label`
# (passing index='Id' only worked because a non-empty string is truthy).
user_scores.to_csv("../profiles/question_scores.csv", encoding="latin1", index=True, index_label='Id')
user_questions.to_csv("../profiles/question_counts.csv", encoding="latin1", index=True, index_label='Id')

## Mapping User Score data and User question count data into a profile

In [72]:
# Reload the saved score and question-count matrices, indexed by their 'Id' column.
user_scores = pd.read_csv(
    "../profiles/question_scores.csv",
    index_col='Id',
    encoding="latin1",
)
user_questions = pd.read_csv(
    "../profiles/question_counts.csv",
    index_col='Id',
    encoding="latin1",
)

### Treating users with all negative scores
For this exercise, we will remove those users' questions from the question and tag datasets

In [None]:
# Rows of the score matrix in which every tag score equals zero.
bad_users = user_scores[user_scores.eq(0).all(axis='columns')]

In [None]:
# Re-read the filtered input tables, again using 'Id' as the index of each.
tags = pd.read_csv(
    "../input/Tags_Filtered.csv",
    index_col='Id',
    encoding='latin1',
)
questions = pd.read_csv(
    "../input/Questions_Filtered.csv",
    index_col='Id',
    encoding="latin1",
)

In [80]:
# Show every question asked by user 25778 together with its tags.
# Both frames are read with index_col='Id', so 'Id' is already the index and
# set_index('Id') would raise KeyError; a plain index join pairs each question
# with its tags (one output row per question/tag pair).
questions.loc[questions['OwnerUserId'] == 25778].join(tags)

Unnamed: 0_level_0,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Tag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
204970,25778.0,2008-10-15T14:40:11Z,,-3,split string fixed character sequence,suppose following string string asd test ass t...,java
204970,25778.0,2008-10-15T14:40:11Z,,-3,split string fixed character sequence,suppose following string string asd test ass t...,string


In [84]:
# Drop every question asked by a "bad" user (a user whose score row is all zeros).
# `bad_users` is a DataFrame indexed by user id; isin() must be given the ids
# (its index). Passing the frame itself compares against its column labels.
questions = questions[~questions['OwnerUserId'].isin(bad_users.index)]

# Keep only the tags that still belong to a remaining question. 'Id' is the
# index of both frames. The previous version negated this mask and therefore
# kept exactly the tags of the questions that had just been removed.
tags = tags[tags.index.isin(questions.index)]

In [88]:
# Write the filtered tables back to disk.
# NOTE: this overwrites the input files in place.
# - to_csv returns None when given a path, so its result must not be assigned
#   back to `tags` / `questions` (that would discard the frames).
# - 'Id' is the index of both frames, so it has to be written out (index=True)
#   for later read_csv(..., index_col='Id') calls to find it.
tags.to_csv("../input/Tags_Filtered.csv", encoding='latin1', index=True, index_label='Id')
questions.to_csv("../input/Questions_Filtered.csv", encoding="latin1", index=True, index_label='Id')

In [89]:
# Keep only the users that have at least one non-zero entry in their row.
has_score = (user_scores != 0).any(axis=1)
has_question = (user_questions != 0).any(axis=1)
user_scores = user_scores.loc[has_score]
user_questions = user_questions.loc[has_question]

In [95]:
# Remaining row counts of the two filtered matrices.
print(user_scores.shape[0], user_questions.shape[0])

553866 553866


## Now that we've gotten rid of bad users, we are ready to generate profiles for the remaining users

### We will normalize a user's question count here

In [20]:
# Load the score and question-count matrices from disk, indexed by 'Id'.
user_scores = pd.read_csv(
    "../profiles/question_scores.csv",
    index_col='Id',
    encoding="latin1",
)
user_questions = pd.read_csv(
    "../profiles/question_counts.csv",
    index_col='Id',
    encoding="latin1",
)

In [42]:
# Ids of the users whose scores add up to zero across all tags.
score_row_totals = user_scores.sum(axis=1)
bad_users = user_scores.index[score_row_totals == 0]

# Remove those users from both matrices.
keep_score_rows = ~user_scores.index.isin(bad_users)
user_scores = user_scores[keep_score_rows]
keep_question_rows = ~user_questions.index.isin(bad_users)
user_questions = user_questions[keep_question_rows]

In [46]:
# Sanity check: both matrices must have the same shape before they are combined.
# Compare `user_questions` directly: `user_questions_index` is only created in a
# later cell, so referencing it here fails on a fresh top-to-bottom run.
user_questions.shape == user_scores.shape

True

In [44]:
# Divide every row by its own total so each user's question counts become per-tag shares.
question_row_totals = user_questions.sum(axis=1)
user_questions_index = user_questions.divide(question_row_totals, axis='index')
user_questions_index.head(1)

Unnamed: 0_level_0,flex,actionscript-3,svn,sql,asp.net,algorithm,colors,c#,.net,c++,...,meteor,laravel,firebase,parse.com,typescript,docker,apache-spark,reactjs,spring-boot,ionic-framework
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26.0,0.115385,0.038462,0.0,0.0,0.153846,0.0,0.0,0.115385,0.038462,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Show the first row of the score matrix.
user_scores.iloc[:1]

Unnamed: 0_level_0,flex,actionscript-3,svn,sql,asp.net,algorithm,colors,c#,.net,c++,...,meteor,laravel,firebase,parse.com,typescript,docker,apache-spark,reactjs,spring-boot,ionic-framework
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26.0,30.0,26.0,0.0,0.0,4.0,0.0,0.0,18.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Since we normalized the values, we expect every row to sum to one

In [45]:
# Row total of one randomly chosen user in the normalised matrix.
user_questions_index.sample(n=1).sum(axis='columns')

Id
5600344.0    1.0
dtype: float64

In [47]:
# Element-wise product of each user's per-tag question share and per-tag score total.
user_information = user_questions_index * user_scores

In [48]:
# Show the first user's profile values for a hand-picked subset of tags.
profile_preview_tags = [
    '.net', 'actionscript-3', 'angularjs', 'asp.net', 'c#', 'cookies', 'css',
    'date', 'flash', 'flex', 'generics', 'html', 'javascript', 'session',
    'sqlite', 'tsql', 'vb.net', 'web-services', 'xml',
]
user_information.head(1)[profile_preview_tags]

Unnamed: 0_level_0,.net,actionscript-3,angularjs,asp.net,c#,cookies,css,date,flash,flex,generics,html,javascript,session,sqlite,tsql,vb.net,web-services,xml
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
26.0,0.230769,1.0,0.076923,0.615385,2.076923,0.230769,0.384615,0.038462,0.038462,3.461538,0.076923,0.384615,0.153846,0.230769,0.038462,0.038462,0.038462,0.153846,0.153846


In this weighted average, the minimum "Knowledge" a person can have is 1, which is what a user gets from asking a single question with a score of 1 (a score of 0 is counted as 1).

In [49]:
# Smallest per-user row total of the profile matrix.
# Series.min() is vectorised and skips NaN; the builtin min() iterates the
# Series element by element and gives an order-dependent result if a NaN is present.
float(user_information.sum(axis=1).min())

0.9999999999999998

In [51]:
# Persist the profiles with the user id written as the first column, labelled 'Id'.
# `index` is a boolean flag; the column header belongs in `index_label`
# (passing index='Id' only worked because a non-empty string is truthy).
user_information.to_csv("../profiles/question_profiles.csv", encoding="latin1", index=True, index_label='Id')