In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:

# import the required libraries

from collections import Counter
from scipy import sparse
import numpy as np
import pandas as pd
import pickle

In [2]:

# reading train and test file

train = pd.read_csv("train/train.csv")
test = pd.read_csv("test.csv")

In [3]:
print(train.shape)
train.head()

(903916, 4)


Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [4]:

# convert the train in the long format to wide format

wide_train = train.pivot_table(index = "user_id", columns="challenge_sequence", values="challenge", aggfunc= lambda x : x).reset_index()

In [5]:
# dropping the user_id, since we won't be needing those for our co-occurrence matrix
wide_train.drop('user_id',axis = 1,inplace = True)

In [6]:

wide_train.head(20)

challenge_sequence,1,2,3,4,5,6,7,8,9,10,11,12,13
0,CI23714,CI23855,CI24917,CI23663,CI23933,CI25135,CI23975,CI25126,CI24915,CI24957,CI24958,CI23667,CI23691
1,CI23663,CI23855,CI23933,CI23975,CI24530,CI23714,CI23648,CI23781,CI23667,CI25135,CI24915,CI25727,CI26051
2,CI26155,CI26156,CI26157,CI26158,CI26159,CI26160,CI26161,CI26162,CI26164,CI26165,CI26163,CI26166,CI26167
3,CI23855,CI24915,CI24917,CI23933,CI23663,CI24958,CI23975,CI23714,CI24953,CI24944,CI25135,CI26051,CI24957
4,CI23855,CI23975,CI24917,CI25135,CI23848,CI23714,CI23663,CI23933,CI24958,CI24915,CI24530,CI24187,CI25126
5,CI23933,CI25727,CI26051,CI25125,CI25124,CI25633,CI23663,CI26050,CI23667,CI24915,CI24031,CI23855,CI28240
6,CI23848,CI23855,CI23975,CI25135,CI23929,CI23714,CI23913,CI23663,CI25298,CI24917,CI23691,CI25733,CI25142
7,CI23855,CI23933,CI24530,CI23663,CI23714,CI24534,CI24915,CI24917,CI25135,CI24527,CI24958,CI24261,CI23648
8,CI23855,CI24917,CI25135,CI23848,CI23714,CI23975,CI23663,CI25142,CI23913,CI25126,CI23769,CI24958,CI24187
9,CI26155,CI26157,CI26158,CI26159,CI26160,CI26161,CI26162,CI26164,CI26165,CI26163,CI26166,CI26167,CI26168


In [7]:
# convert each row for a user into a string

rows = []
for index, row in wide_train.iterrows():
    r = " ".join(row.map(str))
    rows.append(r)

In [8]:

# converting test to wide format

wide_test = test.pivot_table(index = "user_id", columns="challenge_sequence", values="challenge", aggfunc= lambda x : x).reset_index()

In [9]:
wide_test.shape

(39732, 11)

In [10]:
# saving test user_id for future use

test_ids = wide_test['user_id']

In [11]:
# dropping user_id from wide test

wide_test.drop(["user_id"], axis =1, inplace = True)

In [12]:
for index, row in wide_test.iterrows():
    r = " ".join(row.map(str))
    rows.append(r)

In [13]:
# creating a corpus
thefile = open("corpus.txt","w")

In [14]:
for element in rows:
    thefile.write("%s\n"%element)

In [15]:
thefile.close()

In [16]:
# reading the corpus

corpus = open("corpus.txt","r")

In [17]:
corpus

<_io.TextIOWrapper name='corpus.txt' mode='r' encoding='UTF-8'>

In [18]:
# creating a dictionary with key = challenge_name and value = frequency
vocab = Counter()

In [19]:
for line in corpus:
    tokens = line.strip().split()
    vocab.update(tokens)

In [24]:
vocab = {word:(i,freq) for i,(word,freq) in enumerate(vocab.items())}
    

In [25]:
vocab

{'CI23714': (0, 12458),
 'CI23855': (1, 15184),
 'CI24917': (2, 12372),
 'CI23663': (3, 12077),
 'CI23933': (4, 11060),
 'CI25135': (5, 10482),
 'CI23975': (6, 7075),
 'CI25126': (7, 8550),
 'CI24915': (8, 7845),
 'CI24957': (9, 4873),
 'CI24958': (10, 11333),
 'CI23667': (11, 4723),
 'CI23691': (12, 9532),
 'CI24530': (13, 10166),
 'CI23648': (14, 6816),
 'CI23781': (15, 2984),
 'CI25727': (16, 2726),
 'CI26051': (17, 5030),
 'CI26155': (18, 1470),
 'CI26156': (19, 1218),
 'CI26157': (20, 1350),
 'CI26158': (21, 1283),
 'CI26159': (22, 1289),
 'CI26160': (23, 1133),
 'CI26161': (24, 1091),
 'CI26162': (25, 1459),
 'CI26164': (26, 1744),
 'CI26165': (27, 1537),
 'CI26163': (28, 949),
 'CI26166': (29, 1329),
 'CI26167': (30, 1673),
 'CI24953': (31, 5385),
 'CI24944': (32, 2525),
 'CI23848': (33, 9090),
 'CI24187': (34, 6697),
 'CI25125': (35, 4526),
 'CI25124': (36, 6018),
 'CI25633': (37, 620),
 'CI26050': (38, 3021),
 'CI24031': (39, 3708),
 'CI28240': (40, 232),
 'CI23929': (41, 2033

In [27]:
id2word = dict((i, word) for word, (i, _) in enumerate(vocab.items()))


In [28]:
id2word

{'CI23714': 0,
 'CI23855': 1,
 'CI24917': 2,
 'CI23663': 3,
 'CI23933': 4,
 'CI25135': 5,
 'CI23975': 6,
 'CI25126': 7,
 'CI24915': 8,
 'CI24957': 9,
 'CI24958': 10,
 'CI23667': 11,
 'CI23691': 12,
 'CI24530': 13,
 'CI23648': 14,
 'CI23781': 15,
 'CI25727': 16,
 'CI26051': 17,
 'CI26155': 18,
 'CI26156': 19,
 'CI26157': 20,
 'CI26158': 21,
 'CI26159': 22,
 'CI26160': 23,
 'CI26161': 24,
 'CI26162': 25,
 'CI26164': 26,
 'CI26165': 27,
 'CI26163': 28,
 'CI26166': 29,
 'CI26167': 30,
 'CI24953': 31,
 'CI24944': 32,
 'CI23848': 33,
 'CI24187': 34,
 'CI25125': 35,
 'CI25124': 36,
 'CI25633': 37,
 'CI26050': 38,
 'CI24031': 39,
 'CI28240': 40,
 'CI23929': 41,
 'CI23913': 42,
 'CI25298': 43,
 'CI25733': 44,
 'CI25142': 45,
 'CI24534': 46,
 'CI24527': 47,
 'CI24261': 48,
 'CI23769': 49,
 'CI26168': 50,
 'CI26940': 51,
 'CI26942': 52,
 'CI26943': 53,
 'CI26944': 54,
 'CI26945': 55,
 'CI26947': 56,
 'CI26946': 57,
 'CI26948': 58,
 'CI26954': 59,
 'CI26953': 60,
 'CI26950': 61,
 'CI26949': 62,
 '

In [30]:
vocab_size = len(vocab)
print(vocab_size)

5502


In [31]:
cooccurrences = sparse.lil_matrix((vocab_size, vocab_size),dtype=np.float64)
cooccurrences

<5502x5502 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in List of Lists format>

In [32]:

# context window size

window_size = 10

In [33]:
corpus = open("corpus.txt","r")

In [34]:

# Tuneable parameters : window_size, distance

for i, line in enumerate(corpus):
    tokens = line.strip().split()
    token_ids = [vocab[word][0] for word in tokens]
    
    for center_i, center_id in enumerate(token_ids):
        # Collect all word IDs in left window of center word
        context_ids = token_ids[max(0, center_i - window_size) : center_i]
        contexts_len = len(context_ids)

        for left_i, left_id in enumerate(context_ids):
            # Distance from center word
            
            distance = contexts_len - left_i

            # Weight by inverse of distance between words
            increment = 1.0 / float(distance)

            # Build co-occurrence matrix symmetrically (pretend we
            # are calculating right contexts as well)
            cooccurrences[center_id, left_id] += increment
            cooccurrences[left_id, center_id] += increment

In [35]:
# If anything other than None will exclude challenges whose frequencies are below this value.

min_count = None
#min_count = 20
print(min_count)

None


In [36]:
# filling the values in a matrix form

co_matrix = np.zeros([len(id2word),len(id2word)])

for i, (row, data) in enumerate(zip(cooccurrences.rows,cooccurrences.data)):
    if min_count is not None and vocab[id2word[i]][0] < min_count:
        continue
        
    for data_idx, j in enumerate(row):
        if min_count is not None and vocab[id2word[j]][0] < min_count:
            continue
            
        co_matrix[i,j] = data[data_idx]

In [37]:
co_matrix

array([[   0.        , 1218.48412698,  952.68690476, ...,    0.        ,
           0.        ,    0.        ],
       [1218.48412698,    0.        , 1221.0265873 , ...,    0.        ,
           0.        ,    0.        ],
       [ 952.68690476, 1221.0265873 ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ]])

In [38]:
#saving the mapping to a dictionary
pickle_path = "./vocab_mapping.pkl"
pickle_mapping = open(pickle_path,"wb")
pickle.dump(id2word, pickle_mapping)
pickle_mapping.close()

In [39]:
# saving the co-occurence matrix as a dataframe

co_occurence_dataframe = pd.DataFrame(co_matrix)

In [40]:
co_occurence_dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5492,5493,5494,5495,5496,5497,5498,5499,5500,5501
0,0.0,1218.484127,952.686905,1065.586905,985.600397,800.447222,690.27619,986.594841,522.625,260.590476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1218.484127,0.0,1221.026587,1252.965476,1092.545238,968.419048,926.743651,641.663095,721.213492,335.470635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,952.686905,1221.026587,0.0,847.772222,969.061508,647.253571,509.603175,471.850397,797.802381,520.581746,...,0.0,0.0,0.0,0.0,0.142857,0.5,0.0,0.0,0.0,0.0
3,1065.586905,1252.965476,847.772222,0.0,976.407143,746.009921,705.35119,529.753571,646.126984,274.274603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,985.600397,1092.545238,969.061508,976.407143,0.0,538.253175,487.259127,520.790476,588.089683,348.027778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:

res = {v:k for k,v in id2word.items()}

In [42]:
co_occurence_dataframe =co_occurence_dataframe.rename(columns=res)

In [43]:
co_occurence_dataframe = co_occurence_dataframe.rename(index=res)

In [44]:
co_occurence_dataframe.to_csv("co_matrix_with_window_size_1.csv", index = False)

In [45]:
co_occurence_dataframe.head()

Unnamed: 0,CI23714,CI23855,CI24917,CI23663,CI23933,CI25135,CI23975,CI25126,CI24915,CI24957,...,CI27326,CI29005,CI25760,CI28335,CI25962,CI25968,CI27314,CI27334,CI25342,CI28218
CI23714,0.0,1218.484127,952.686905,1065.586905,985.600397,800.447222,690.27619,986.594841,522.625,260.590476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23855,1218.484127,0.0,1221.026587,1252.965476,1092.545238,968.419048,926.743651,641.663095,721.213492,335.470635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI24917,952.686905,1221.026587,0.0,847.772222,969.061508,647.253571,509.603175,471.850397,797.802381,520.581746,...,0.0,0.0,0.0,0.0,0.142857,0.5,0.0,0.0,0.0,0.0
CI23663,1065.586905,1252.965476,847.772222,0.0,976.407143,746.009921,705.35119,529.753571,646.126984,274.274603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23933,985.600397,1092.545238,969.061508,976.407143,0.0,538.253175,487.259127,520.790476,588.089683,348.027778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
wide_test.head()

challenge_sequence,1,2,3,4,5,6,7,8,9,10
0,CI23855,CI23933,CI24917,CI24915,CI23714,CI23663,CI24958,CI25135,CI25727,CI24530
1,CI23663,CI23855,CI24917,CI23933,CI23975,CI23714,CI25135,CI24915,CI24958,CI23781
2,CI26939,CI26940,CI26941,CI26942,CI26943,CI26944,CI26945,CI26947,CI26948,CI26954
3,CI23663,CI23855,CI23975,CI23714,CI23848,CI23933,CI25135,CI23781,CI24530,CI23667
4,CI23855,CI23975,CI25135,CI23848,CI23714,CI24917,CI23929,CI25733,CI25126,CI23913


In [47]:
wide_test.shape

(39732, 10)

In [48]:

final_predictions = []

for i in range(0,39732):
    predictions = [wide_test.loc[i,10]]
    counter = 0
    for stimulus in predictions:
        predictions.append(co_occurence_dataframe[stimulus].idxmax())
        counter+=1
        if counter == 3:
            break
            
    final_predictions.append(predictions[1:])

In [49]:
# making predictions with the co-occurence_matrix based on 10th challenge only
final_predictions_new = []

for i in range(0,39732):
    stimulus = wide_test.loc[i,10]
    
    final_predictions_new.append(list(co_occurence_dataframe[stimulus].nlargest(3).index))

In [50]:
largest_3 = pd.DataFrame(final_predictions_new)

In [51]:
largest_3['user_id'] = test_ids

In [52]:

largest_3.head()

Unnamed: 0,0,1,2,user_id
0,CI23691,CI23714,CI23663,4577
1,CI23663,CI23714,CI23933,4578
2,CI26953,CI26955,CI26951,4579
3,CI23648,CI23663,CI23933,4583
4,CI23714,CI23855,CI25142,4584


In [53]:
largest_3_long = pd.melt(largest_3,id_vars="user_id",var_name="sequence", value_name="challenge" )

In [54]:
final_predictions

[['CI23691', 'CI24530', 'CI23691'],
 ['CI23663', 'CI23855', 'CI23663'],
 ['CI26953', 'CI26954', 'CI26953'],
 ['CI23648', 'CI24228', 'CI23702'],
 ['CI23714', 'CI23855', 'CI23663'],
 ['CI24228', 'CI23702', 'CI24228'],
 ['CI23691', 'CI24530', 'CI23691'],
 ['CI26167', 'CI26166', 'CI26167'],
 ['CI23714', 'CI23855', 'CI23663'],
 ['CI26166', 'CI26167', 'CI26166'],
 ['CI26167', 'CI26166', 'CI26167'],
 ['CI23855', 'CI23663', 'CI23855'],
 ['CI24534', 'CI24532', 'CI24534'],
 ['CI24228', 'CI23702', 'CI24228'],
 ['CI26166', 'CI26167', 'CI26166'],
 ['CI23769', 'CI24187', 'CI23769'],
 ['CI26953', 'CI26954', 'CI26953'],
 ['CI24876', 'CI25075', 'CI24876'],
 ['CI23714', 'CI23855', 'CI23663'],
 ['CI24530', 'CI23691', 'CI24530'],
 ['CI26939', 'CI26961', 'CI26960'],
 ['CI24534', 'CI24532', 'CI24534'],
 ['CI23855', 'CI23663', 'CI23855'],
 ['CI23855', 'CI23663', 'CI23855'],
 ['CI26954', 'CI26953', 'CI26954'],
 ['CI23648', 'CI24228', 'CI23702'],
 ['CI23769', 'CI24187', 'CI23769'],
 ['CI24876', 'CI25075', 'CI2

In [55]:
sub = pd.read_csv('sample_submission.csv')

In [59]:
seq = []
for i in final_predictions:
    for j in i:
        seq.append(j)

In [61]:
sub['challenge'] = seq

In [64]:
sub.to_csv('nlp_corr.csv',index = False)

Second way

In [4]:
df = pd.read_csv("train/challenge_data.csv")

In [5]:
print(df.shape)
df.head()

(5606, 9)


Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [6]:
print(train.shape)
train.head()

(903916, 4)


Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [12]:
train.rename(columns = {'challenge' : 'challenge_ID'},inplace = True)
test.rename(columns = {'challenge' : 'challenge_ID'},inplace = True)
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge_ID
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [13]:
#df2 = df.merge(train,on = 'challenge_ID')
df2_test = df.merge(test,on = 'challenge_ID')

In [11]:
print(df2.shape)
df2.head()

(903916, 12)


Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id,user_sequence,user_id,challenge_sequence
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,,32876_5,32876,5
1,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,,88820_5,88820,5
2,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,,97150_8,97150,8
3,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,,97359_8,97359,8
4,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0,37172_11,37172,11


In [17]:
print(df2_test.shape)
df2_test.head()

(397320, 12)


Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id,user_sequence,user_id,challenge_sequence
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,,83661_10,83661,10
1,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,,91425_4,91425,4
2,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0,46401_7,46401,7
3,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0,70194_5,70194,5
4,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0,93678_9,93678,9


In [31]:
print("No. of challenges: ", df2['challenge_ID'].nunique())
print("No. of progaming languages: ", df2['programming_language'].nunique())
print("Minimum submissions: ", df2['total_submissions'].min())
print("Max submissions: ", df2['total_submissions'].max())
print("No. of Authors: ", df2['author_ID'].nunique())
print("No. org of Authors: ", df2['author_org_ID'].nunique())
print("No. categories: ", df2['category_id'].nunique())

No. of challenges:  5348
No. of progaming languages:  3
Minimum submissions:  4.0
Max submissions:  43409.0
No. of Authors:  3327
No. org of Authors:  1646
No. categories:  189


In [30]:
df2.isnull().sum()

challenge_ID                0
programming_language        0
challenge_series_ID        64
total_submissions        3500
publish_date                0
author_ID                3473
author_gender            6078
author_org_ID           46360
category_id             39960
user_sequence               0
user_id                     0
challenge_sequence          0
dtype: int64

In [37]:
df[['challenge_ID','programming_language']].groupby('challenge_ID').count()

Unnamed: 0_level_0,programming_language
challenge_ID,Unnamed: 1_level_1
CI23478,1
CI23479,1
CI23480,1
CI23481,1
CI23482,1
CI23483,1
CI23484,1
CI23485,1
CI23486,1
CI23487,1


In [53]:
df2[['user_id','programming_language']].groupby('user_id').nunique().sort_values(by = 'programming_language',ascending = False)
#df2[df2['user_id'] == 77954]

Unnamed: 0_level_0,user_id,programming_language
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
77954,1,3
57693,1,3
31422,1,3
88894,1,3
48145,1,3
92964,1,3
43255,1,3
81008,1,3
106818,1,3
93360,1,3


In [None]:
df2['programming_language'] = df2['programming_language'].astype(str)
df2['total_submissions'] = df2['total_submissions'].astype(str)
df2['category_id'] = df2['category_id'].astype(str)

In [64]:
def create_soup(x):
    return ' '.join(x['programming_language']) + ' ' + ' '.join(x['total_submissions']) + ' ' + x['category_id'] 
df2['soup'] = df2.apply(create_soup, axis=1)

In [65]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])


In [66]:
type(count_matrix)

<903916x190 sparse matrix of type '<class 'numpy.int64'>'
	with 903916 stored elements in Compressed Sparse Row format>

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)