## This Jupyter Notebook generates the Kaggle Challenge Dataset from Original User Tag Matrix as well the Synthetic User Tag Matrix that was generated using CT-GAN model. 

### We include both the original, as well as the synthetic dataset in the train and test sets. The reason for this is specified in the report; in brief; we observed that performance on pure synthetic data was better than performance on pure original data, on the Alternate Least Squares Model. Thus, we took the combination of these two models as the average performance is almost the average of the individual performances on these two datasets by the ALS algorithm.

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

We load the original and synthetic datasets from the specified paths. (modify paths accordingly)

In [2]:
# Load the datasets
original_df = pd.read_csv("/home/girikk/Concept positioning system/Challenge Dataset/Final Dataset/UTM3.csv")
# synthetic_df = pd.read_csv("/home/girikk/Concept positioning system/Challenge Dataset/Final Dataset/synthetic_data_matrix-75krows.csv")

We filter both datasets to include only rows with at least 5 tags.

In [3]:
# Filter rows with at least 5 tags
original_df = original_df[original_df.drop(columns="OwnerUserId").sum(axis=1) >= 5]
# synthetic_df = synthetic_df[synthetic_df.drop(columns="OwnerUserId").sum(axis=1) >= 5]

The Tag Descriptions are loaded

In [4]:
tag_descriptions_df = pd.read_csv('/home/girikk/Concept positioning system/Challenge Dataset/Final Dataset/Tag Descriptions.csv')
tag_to_id = dict(zip(tag_descriptions_df['TagName'], tag_descriptions_df['TagID']))

We remove duplicate users from the synthetic dataset that are already in the original dataset. (hardly 1 or 2)

In [5]:
# Remove duplicates from synthetic_df that are already in original_df
# synthetic_df = synthetic_df[~synthetic_df['OwnerUserId'].isin(original_df['OwnerUserId'])]

The datasets are merged and shuffled.

In [6]:
# Merge the datasets
# merged_df = pd.concat([original_df, synthetic_df], ignore_index=True)

# Shuffle the merged dataframe
merged_df = original_df.sample(frac=1, random_state=42).reset_index(drop=True)

New unique user IDs are created to anonymize the data.

In [7]:
# Create new unique OwnerUserIds
unique_ids = merged_df['OwnerUserId'].unique()
new_unique_ids = np.arange(1, len(unique_ids) + 1)
np.random.shuffle(new_unique_ids)
id_mapping = dict(zip(unique_ids, new_unique_ids))

# Update the OwnerUserId in the merged dataframe
merged_df['NewOwnerUserId'] = merged_df['OwnerUserId'].map(id_mapping)

In [8]:
merged_df

Unnamed: 0,OwnerUserId,recursion,hashtable,hashmap,boolean-polynomials,digraphs,probing,stack-allocation,heap-size,cyclic-graph,...,simplex,hamiltonian-path,array-comparison,space-complexity,traveling-salesman,boolean-logic,heuristics,bitstring,tree,NewOwnerUserId
0,1828486,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,47918
1,1361822,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8354
2,127480,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,7067
3,2119053,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4527
4,278842,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,11977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51538,408773,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,42327
51539,7238916,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,47696
51540,4350586,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31063
51541,7898,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1326


In [9]:
filtered_df = merged_df[merged_df.NewOwnerUserId == 103820]
filtered_df.columns[(filtered_df == 1).any()].tolist()

[]

In [10]:
id_mapping

{1828486: 47918,
 1361822: 8354,
 127480: 7067,
 2119053: 4527,
 278842: 11977,
 13825434: 17719,
 677472: 27951,
 4142343: 44846,
 1267329: 50168,
 182689: 31355,
 3014199: 14108,
 6535399: 14698,
 5749570: 21651,
 1850983: 48283,
 1276128: 34687,
 6752997: 3110,
 978360: 14002,
 2150063: 28942,
 1895611: 48001,
 14585634: 22914,
 2908254: 33989,
 322912: 41839,
 948128: 973,
 1419166: 24766,
 1168654: 3942,
 1839453: 37932,
 116712: 9379,
 584183: 20512,
 1906688: 5303,
 585263: 10655,
 1034255: 13967,
 4080489: 36295,
 147511: 26416,
 2756719: 8932,
 247357: 37609,
 4700966: 18621,
 428442: 28028,
 5132337: 17513,
 555830: 29238,
 9706: 45038,
 9283016: 1973,
 1567452: 16663,
 8478737: 20862,
 250259: 20396,
 3171: 37052,
 3195526: 22942,
 6919895: 18250,
 3301760: 13513,
 8255744: 35666,
 5600318: 42939,
 13365: 38027,
 497982: 48211,
 1174750: 9127,
 2792896: 15543,
 12983643: 6813,
 2656421: 40840,
 31486: 23726,
 1244: 5379,
 11025476: 47472,
 296974: 35469,
 4265739: 4743,
 136

this below code stores the id mapping table into a dataframe. this allows you to revert back from merged df to original synthetic and real df

In [11]:
df = pd.DataFrame(list(id_mapping.items()), columns=['OriginalUserId', 'RandomizedUserId'])
# # Initialize the 'table' column with default value
# df['table'] = ''

# # Update the 'table' column based on the source dataframe
# df.loc[df['OriginalUserId'].isin(original_df['OwnerUserId']), 'table'] = 'original_df'
# df.loc[df['OriginalUserId'].isin(synthetic_df['OwnerUserId']), 'table'] = 'synthetic_df'


df.to_csv("id_mapping.csv")

We create user-tag pairs, now checking each column for a value of 1 to indicate the presence of a tag

In [12]:
# Create user-tag tuple pairs
user_tag_dict = defaultdict(list)
for _, row in merged_df.iterrows():
    user_id = row['NewOwnerUserId']
    # Iterate through all columns except 'OwnerUserId' and 'NewOwnerUserId'
    for tag, value in row.drop(['OwnerUserId', 'NewOwnerUserId']).items():
        if value == 1 and tag in tag_to_id:  # Check if the tag is present and in our Tag Descriptions
            user_tag_dict[user_id].append(tag_to_id[tag])


The data is split into train and test sets, with users having 10 or more tags contributing to both sets.

In [13]:
# Split into train and test sets
train_pairs = []
test_pairs = []

for user_id, tag_ids in user_tag_dict.items():
    if len(tag_ids) >= 10:
        # Randomly select 5 tags for test set if user has 10 or more tags
        test_sample = np.random.choice(tag_ids, 5, replace=False)
        train_sample = [tag_id for tag_id in tag_ids if tag_id not in test_sample]
        
        for tag_id in test_sample:
            test_pairs.append((user_id, tag_id))
        for tag_id in train_sample:
            train_pairs.append((user_id, tag_id))
    else:
        # If user has less than 10 tags, all go to train set
        for tag_id in tag_ids:
            train_pairs.append((user_id, tag_id))

We create and save both pair-wise (UserID, TagID) and matrix-like representations of the data.

In [14]:
# Create dataframes from the train and test pairs
train_df = pd.DataFrame(train_pairs, columns=['UserID', 'TagID'])
test_df = pd.DataFrame(test_pairs, columns=['UserID', 'TagID'])

# Save the train and test sets to CSV files
train_df.to_csv('train_user_tag_pairs.csv', index=False)
test_df.to_csv('test_user_tag_pairs.csv', index=False)


In [15]:
train_df

Unnamed: 0,UserID,TagID
0,47918,58
1,47918,163
2,47918,72
3,47918,162
4,47918,161
...,...,...
412077,39885,28
412078,39885,163
412079,39885,161
412080,39885,73


In [16]:
test_ids = test_df.drop(columns=["TagID"])
test_ids = test_ids.drop_duplicates()
test_ids

Unnamed: 0,UserID
0,8354
5,7067
10,11977
15,50168
20,31355
...,...
73780,6832
73785,9693
73790,38776
73795,31482


In [17]:
test_ids.to_csv("test.csv", index=False)

In [18]:
# Create and save matrix-like dataframes
# Create pivot tables for train and test sets
train_matrix_df = pd.pivot_table(train_df, values='TagID', index='UserID', 
                                 columns='TagID', aggfunc=lambda x: 1, fill_value=0)
test_matrix_df = pd.pivot_table(test_df, values='TagID', index='UserID', 
                                columns='TagID', aggfunc=lambda x: 1, fill_value=0)


In [19]:
# Ensure all TagIDs are present in both dataframes
all_tag_ids = sorted(set(train_matrix_df.columns) | set(test_matrix_df.columns))

for tag_id in all_tag_ids:
    if tag_id not in train_matrix_df.columns:
        train_matrix_df[tag_id] = 0
    if tag_id not in test_matrix_df.columns:
        test_matrix_df[tag_id] = 0

# Sort columns to ensure consistency
train_matrix_df = train_matrix_df.reindex(sorted(train_matrix_df.columns), axis=1)
test_matrix_df = test_matrix_df.reindex(sorted(test_matrix_df.columns), axis=1)

# Save matrix-like dataframes to CSV
train_matrix_df.to_csv('train_matrix.csv')
test_matrix_df.to_csv('test_matrix.csv')

In [20]:
train_matrix_df

TagID,1,2,3,4,5,6,7,8,9,10,...,222,223,224,225,226,227,228,229,230,231
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51539,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
51540,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
51541,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
51542,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Creating solution.csv

In [21]:
import pandas as pd
import numpy as np
df = pd.read_csv("/home/girikk/Concept positioning system/Challenge Dataset/FInal Dataest only real only only real/test_user_tag_pairs.csv")
df

Unnamed: 0,UserID,TagID
0,8354,164
1,8354,71
2,8354,3
3,8354,73
4,8354,58
...,...,...
73800,10488,164
73801,10488,136
73802,10488,161
73803,10488,69


In [22]:
# Add a sequential count within each user group
df['TagOrder'] = df.groupby('UserID').cumcount() + 1

# Pivot the dataframe to get each UserID with 5 TagIDs as columns
df_pivoted = df.pivot(index='UserID', columns='TagOrder', values='TagID')
df_pivoted

TagOrder,1,2,3,4,5
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,180,55,230,20,94
4,170,58,12,169,9
19,15,161,116,42,73
21,53,181,166,164,132
23,163,169,166,168,57
...,...,...,...,...,...
51532,164,170,51,161,57
51536,170,164,9,55,169
51538,34,164,28,162,71
51539,69,163,53,28,161


In [23]:
# Rename columns to Tag1, Tag2, Tag3, Tag4, Tag5
df_pivoted.columns = [f'Tag{i}' for i in range(1, 6)]

# Reset index to make UserID a column again
df_pivoted = df_pivoted.reset_index()

In [24]:
df_pivoted

Unnamed: 0,UserID,Tag1,Tag2,Tag3,Tag4,Tag5
0,1,180,55,230,20,94
1,4,170,58,12,169,9
2,19,15,161,116,42,73
3,21,53,181,166,164,132
4,23,163,169,166,168,57
...,...,...,...,...,...,...
14756,51532,164,170,51,161,57
14757,51536,170,164,9,55,169
14758,51538,34,164,28,162,71
14759,51539,69,163,53,28,161


In [25]:
# Add a Usage column
np.random.seed(0)  # For reproducibility
usage_choices = np.random.choice(['Public', 'Private'], size=len(df_pivoted), p=[0.3, 0.7])
df_pivoted['Usage'] = usage_choices


In [26]:
df_pivoted

Unnamed: 0,UserID,Tag1,Tag2,Tag3,Tag4,Tag5,Usage
0,1,180,55,230,20,94,Private
1,4,170,58,12,169,9,Private
2,19,15,161,116,42,73,Private
3,21,53,181,166,164,132,Private
4,23,163,169,166,168,57,Private
...,...,...,...,...,...,...,...
14756,51532,164,170,51,161,57,Public
14757,51536,170,164,9,55,169,Public
14758,51538,34,164,28,162,71,Public
14759,51539,69,163,53,28,161,Public


In [27]:
df_pivoted.to_csv("solutions.csv", index=False)

### Users.csv based on id_mapping

In [28]:
users_data = pd.read_csv("/home/girikk/Concept positioning system/Challenge Dataset/User Profiles.csv")
users_data

Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes,AccountAgeDays,DaysSinceLastActivity
0,1,63708,596583,3270,1108,5776,97
1,3,15767,30431,7765,91,5776,391
2,4,33554,84765,827,88,5776,48
3,5,52637,15464,759,26,5776,444
4,13,221192,46081,5135,193,5775,666
...,...,...,...,...,...,...,...
499541,228450,12977,330,276,28,5279,54
499542,228461,597,54,120,3,5279,48
499543,228489,13527,453,346,17,5279,440
499544,228508,3404,39,204,3,5279,54


In [29]:
# Filter out rows where Id is not in the dictionary
filtered_users_data = users_data[users_data['Id'].isin(id_mapping.keys())]

# Replace Ids with shuffled Ids
filtered_users_data['Id'] = filtered_users_data['Id'].map(id_mapping)

filtered_users_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_users_data['Id'] = filtered_users_data['Id'].map(id_mapping)


Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes,AccountAgeDays,DaysSinceLastActivity
0,25397,63708,596583,3270,1108,5776,97
2,50351,33554,84765,827,88,5776,48
3,6032,52637,15464,759,26,5776,444
4,9083,221192,46081,5135,193,5775,666
5,44432,54667,5804,847,199,5775,435
...,...,...,...,...,...,...,...
499517,21528,9126,668,1393,29,5280,57
499523,34577,1027,68,33,0,5280,3818
499524,50982,15127,716,407,88,5280,337
499534,41026,8508,1013,747,32,5280,50


In [30]:
filtered_users_data.to_csv("user_profile.csv", index=False)