In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter
from sklearn.feature_extraction.text import CountVectorizer
import random
import datetime

<p style='font-size:20px'><b> Dataset re-evaluation </b></p>
<p>

Based on the modelling implemented, following are the best result obtained on test data so far: 

Model A) Log-reg, l1 penalty, alpha = 3.25e-6 | Micro-avg F1-score = 56.0%, Macro-avg F1-score = 43.2%

Model B) Lin-SVM, l1 penalty, alpha = 3.25e-6 | Micro-avg F1-score = 57.1%, Macro-avg F1-score = 42.1%

- In both cases, the Macro-avg score is quite low compared to the Micro-avg F1 score.
- This implies that the F1-score for the less frequent tags is quite lower that the top tags which may be due low datapoints having these tags since the points were sampled randomply.
- To rectify this, we will now sample the data to capture atleast 1000 datapoints for each of the 500 tags to be predicted.
    
</p>


In [3]:
start_time = datetime.datetime.now()

# Load the Train.csv dataset for Stack over flow tagging problem 
raw_data = pd.read_csv('dataset/Train.csv')

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:06:39.320949


<p style='font-size:16px'><b> Old dataset comparison (1.5L points)  </b></p>

<p>

Plotting the number of data points associated to each tag in the old dataset havng 1.5L datapoints. 
    
</p>


In [5]:
start_time = datetime.datetime.now()

# Load the trimmed data set of 1.25L datapoints for preprocessing
raw_data_old = pd.read_csv('dataset/trimmed.csv')

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:05.098217


In [7]:
start_time = datetime.datetime.now()

# List to store each tagging instance accross all datapoints
tag_list = []

tags = [str(x).split(" ") for x in raw_data_old.Tags_shortlist]

for datapoint in tags:
    for tag in datapoint:
        tag_list.append(tag)

# Calculating the unique tags & their frequency in a descending order from the above tag_list        
unique_tags, tag_count = np.unique(tag_list, return_counts=True)
sort_indices = np.argsort(tag_count)[::-1]
unique_tags = unique_tags[sort_indices]

print("1. Number of unique tags: ", len(unique_tags))

cvt = CountVectorizer(analyzer = lambda x:x)
ques_tag_map = cvt.fit_transform(tags)
tag_vocab = cvt.vocabulary_

_, tag_count = np.unique(ques_tag_map.nonzero()[1], return_counts=True)
tag_count.sort()

print("\n2. Number of questions associated to a tag: \n", tag_count)

print("\n3. Lowest number of questions associated to a tag: ", np.min(tag_count))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

1. Number of unique tags:  500

2. Number of questions associated to a tag: 
 [  116   121   123   124   124   124   126   128   129   129   129   130
   130   131   134   134   135   136   136   136   137   137   137   138
   138   138   139   139   139   139   140   140   140   141   141   142
   142   142   142   143   144   144   145   145   145   145   146   146
   146   147   147   147   147   147   147   147   148   148   149   149
   149   150   150   150   151   152   154   154   154   155   156   156
   156   157   157   157   157   158   158   158   158   158   158   158
   159   160   160   160   160   160   160   160   161   161   161   161
   162   162   162   163   163   164   165   165   165   166   166   166
   167   167   167   168   168   168   168   168   168   169   169   171
   171   172   172   172   172   172   173   174   175   175   176   176
   176   177   177   178   178   178   178   178   178   178   179   179
   180   180   180   180   181   181   181   1

In [23]:
print(raw_data_old.shape)

(150000, 4)


<p style='font-size:16px'><b> Inferences (Old data)  </b></p>

<p>

- Total number of points in the dataset = 150000
- Lowest number & % of questions associated to a tag = 116, 0.0073%
    
</p>


<p style='font-size:16px'><b> New data (Tag based Even sampling)  </b></p>


In [9]:
# drop the duplicate rows and create a new dataset

start_time = datetime.datetime.now()

raw_data = raw_data.drop_duplicates(subset = ['Title', 'Body', 'Tags'])

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:17:16.971711


In [11]:
start_time = datetime.datetime.now()

tags = [str(x).split(" ") for x in raw_data.Tags]

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:12.082068


In [12]:
start_time = datetime.datetime.now()

# List to store each tagging instance accross all datapoints
tag_list = []

for datapoint in tags:
    for tag in datapoint:
        tag_list.append(tag)

# Calculating the unique tags & their frequency in a descending order from the above tag_list        
unique_tags, tag_count = np.unique(tag_list, return_counts=True)
sort_indices = np.argsort(tag_count)[::-1]
unique_tags = unique_tags[sort_indices]

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:17.675524


In [13]:
start_time = datetime.datetime.now()

# Creating a sparse representation of the datapoint/question number and associated tags
cvt = CountVectorizer(analyzer = lambda x:x)
ques_tag_map = cvt.fit_transform(tags)
tag_vocab = cvt.vocabulary_
    
print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:13.428589


In [14]:
start_time = datetime.datetime.now()

# Reducing data size for adjusting to PC computational limitations

# Filtering out the datapoints that don't have any of the top 500 tags as the associated tags
num_top_tags = 500
top_tags = unique_tags[:num_top_tags]
tag_inds = [tag_vocab[tag] for tag in top_tags]
relev_ques = ques_tag_map[:, tag_inds]

minq_per_tag = 1000
minq_ind = set()

for i in range(num_top_tags):
    
    ques = relev_ques[:, i].nonzero()[0]
    temp = min(minq_per_tag, len(ques))
    rand_inds = random.sample(range(0, len(ques)), temp)
    minq_ind.update(ques[rand_inds])
    
trimmed_data = raw_data.iloc[list(minq_ind)]
print(trimmed_data.shape)
    
print("\nBlock execution time: ", datetime.datetime.now() - start_time)

(475777, 4)

Block execution time:  0:01:37.715584


In [15]:
start_time = datetime.datetime.now()

# Creating a new column name Tags_shortlist that contains only the top 500 tags from the Tags column
tags_temp = [str(x).split(" ") for x in trimmed_data.Tags]
shortlist_tags = [[item for item in datapoint if item in top_tags] for datapoint in tags_temp]
shortlist_tags = [" ".join(item) for item in shortlist_tags]
trimmed_data['Tags_shortlist'] = shortlist_tags

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:15.116112


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trimmed_data['Tags_shortlist'] = shortlist_tags


In [17]:
start_time = datetime.datetime.now()

# List to store each tagging instance accross all datapoints
tag_list = []

tags = [str(x).split(" ") for x in trimmed_data.Tags_shortlist]

for datapoint in tags:
    for tag in datapoint:
        tag_list.append(tag)

# Calculating the unique tags & their frequency in a descending order from the above tag_list        
unique_tags, tag_count = np.unique(tag_list, return_counts=True)
sort_indices = np.argsort(tag_count)[::-1]
unique_tags = unique_tags[sort_indices]

print("1. Number of unique tags: ", len(unique_tags))

cvt = CountVectorizer(analyzer = lambda x:x)
ques_tag_map = cvt.fit_transform(tags)
tag_vocab = cvt.vocabulary_

_, tag_count = np.unique(ques_tag_map.nonzero()[1], return_counts=True)
tag_count.sort()

print("\n2. Number of questions associated to a tag: \n", tag_count)

print("\n3. Lowest number of questions associated to a tag: ", np.min(tag_count))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

1. Number of unique tags:  500

2. Number of questions associated to a tag: 
 [ 1105  1132  1135  1140  1148  1149  1150  1152  1154  1157  1160  1161
  1161  1162  1163  1163  1163  1165  1167  1173  1179  1182  1183  1191
  1191  1196  1196  1198  1198  1203  1204  1206  1206  1209  1209  1210
  1215  1216  1219  1220  1221  1225  1228  1229  1230  1232  1236  1239
  1243  1245  1246  1249  1249  1251  1251  1251  1252  1252  1254  1254
  1254  1255  1258  1259  1260  1260  1260  1261  1261  1262  1262  1263
  1263  1264  1264  1266  1266  1269  1270  1272  1273  1273  1274  1275
  1275  1276  1277  1277  1277  1278  1280  1280  1283  1283  1285  1287
  1288  1288  1289  1291  1294  1295  1296  1298  1298  1299  1300  1300
  1301  1301  1302  1302  1303  1304  1305  1307  1309  1310  1311  1311
  1313  1315  1316  1318  1321  1321  1322  1322  1323  1324  1327  1327
  1327  1328  1333  1333  1334  1337  1339  1340  1342  1343  1345  1346
  1347  1349  1350  1351  1352  1353  1355  13

<p style='font-size:16px'><b> Inferences (New data)  </b></p>

<p>

- Total number of points in the dataset = 475777
- Lowest number & % of questions associated to a tag = 1105, 0.232%
    
Compared to the old data, the tag with lowest frequency still accounts for 0.232 datapoints compared to 0.0773% in the old dataset. 


</p>


In [25]:
start_time = datetime.datetime.now()

# Save data to csv file
#### Imp: Careful not to overwrite data ###

try:
    trimmed_data.to_csv('dataset 5L/trimmed.csv', mode = 'w', index = False)
except:
    print("File already saved?")

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:01:09.076848
