In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import os

In [2]:
# load the data
df = pd.read_csv('../Data/clean/mostStarredReposCleaned.csv')
df.head()

Unnamed: 0,id,name,description,url,stars,language,num_forks,open_issues
0,54346799,publicapis,collective list free APIs,https://api.github.com/repos/public-apis/publi...,278839,Python,31309,237
1,83222441,systemdesignprimer,Learn design largescale systems Prep system de...,https://api.github.com/repos/donnemartin/syste...,245714,Python,42633,415
2,63476337,Python,Algorithms implemented Python,https://api.github.com/repos/TheAlgorithms/Python,175617,Python,43858,224
3,123458551,Python100Days,Python 100,https://api.github.com/repos/jackfrued/Python-...,146346,Python,51102,706
4,1039520,youtubedl,Commandline program download videos YouTubecom...,https://api.github.com/repos/ytdl-org/youtube-dl,126899,Python,9785,4263


In [3]:
# delete missing values
df = df.dropna()

# check missing values
df.isnull().sum()

id             0
name           0
description    0
url            0
stars          0
language       0
num_forks      0
open_issues    0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1133 entries, 0 to 1145
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           1133 non-null   int64 
 1   name         1133 non-null   object
 2   description  1133 non-null   object
 3   url          1133 non-null   object
 4   stars        1133 non-null   int64 
 5   language     1133 non-null   object
 6   num_forks    1133 non-null   int64 
 7   open_issues  1133 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 79.7+ KB


# Preprocessing and Feature Engineering

### Encoding

In [5]:
# One-hot encoding of categorical variables
categorical_columns = ['language']
df = pd.get_dummies(df, columns=categorical_columns)
df.head()

Unnamed: 0,id,name,description,url,stars,num_forks,open_issues,language_C,language_C#,language_C++,...,language_Kotlin,language_Nix,language_PHP,language_Python,language_Ruby,language_Rust,language_Scala,language_Shell,language_Swift,language_TypeScript
0,54346799,publicapis,collective list free APIs,https://api.github.com/repos/public-apis/publi...,278839,31309,237,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,83222441,systemdesignprimer,Learn design largescale systems Prep system de...,https://api.github.com/repos/donnemartin/syste...,245714,42633,415,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,63476337,Python,Algorithms implemented Python,https://api.github.com/repos/TheAlgorithms/Python,175617,43858,224,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,123458551,Python100Days,Python 100,https://api.github.com/repos/jackfrued/Python-...,146346,51102,706,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1039520,youtubedl,Commandline program download videos YouTubecom...,https://api.github.com/repos/ytdl-org/youtube-dl,126899,9785,4263,0,0,0,...,0,0,0,1,0,0,0,0,0,0


### Vectorizing textual data and converting them to pytorch tensors for later model training

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

df['description'] = df['description'].fillna('').astype(str)

# Initializing TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Applying TF-IDF to 'description' column
tfidf_description = tfidf_vectorizer.fit_transform(df['description']).toarray()

# Converting TF-IDF results to PyTorch tensors
tfidf_description_tensor = torch.tensor(tfidf_description, dtype=torch.float32)

# Displaying the shape of the tensors
tfidf_description_tensor.shape

torch.Size([1133, 1000])

### Normalization

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Selecting numerical features for normalization
numerical_features = ['stars', 'num_forks', 'open_issues']

# Check for NaNs in numerical features
if df[numerical_features].isnull().values.any():
    print("NaN values found in numerical features, handling them...")
    # Handle NaNs here, e.g., df[numerical_features].fillna(df[numerical_features].mean(), inplace=True)

# Initializing the MinMaxScaler
scaler = MinMaxScaler()

# Applying normalization to the numerical features
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Check if NaNs are introduced by the scaler
if np.isnan(df[numerical_features].values).any():
    print("NaN values introduced after scaling")

# Converting the normalized data to PyTorch tensors
normalized_features_tensor = torch.tensor(df[numerical_features].values, dtype=torch.float32)

# Displaying the first few rows of the normalized dataframe and the shape of the tensor
df[numerical_features].head(), normalized_features_tensor.shape


(      stars  num_forks  open_issues
 0  0.727603   0.350101     0.008399
 1  0.641047   0.476739     0.014707
 2  0.457884   0.490438     0.007938
 3  0.381399   0.571449     0.025020
 4  0.330584   0.109394     0.151079,
 torch.Size([1133, 3]))

#### Simulated User Preferences

In [8]:
# Creating a simulated user-repo interaction matrix
simulated_users = {'user1': 'language_Python', 'user2': 'language_JavaScript', 'user3': 'language_Java'}

# Initialize the interaction matrix as a NumPy array of zeros
num_rows = df.shape[0]
interaction_matrix_np = np.zeros((num_rows, len(simulated_users)))

# Populate the interaction matrix using NumPy
user_columns = [simulated_users[user] for user in simulated_users]
for i, column in enumerate(user_columns):
    interaction_matrix_np[:, i] = df[column].values

# Convert the NumPy array to a Pandas DataFrame (optional, for verification)
interaction_matrix_df = pd.DataFrame(interaction_matrix_np, columns=simulated_users.keys())

# Convert the interaction matrix to a PyTorch tensor
interaction_matrix_tensor = torch.tensor(interaction_matrix_np, dtype=torch.float32)

In [9]:
from sklearn.model_selection import train_test_split

# Assume normalized_features_tensor is a tensor of your numerical features
# Concatenate all features (content, collaborative, numerical) into one tensor
all_features = torch.cat((tfidf_description_tensor, normalized_features_tensor, interaction_matrix_tensor), dim=1)

# Create a dummy target variable (This should be replaced with your actual target variable)
target = torch.randint(0, 2, (all_features.shape[0], 1), dtype=torch.float)

# Splitting the data into training and test sets
features_train, features_test, target_train, target_test = train_test_split(
    all_features, target, test_size=0.2, random_state=42
)

In [10]:
import torch.nn as nn
import torch.nn.functional as F

class HybridRecommendationModel(nn.Module):
    def __init__(self, num_features, num_hidden=128):
        super(HybridRecommendationModel, self).__init__()
        self.fc1 = nn.Linear(num_features, num_hidden)
        self.fc2 = nn.Linear(num_hidden, num_hidden)
        self.output = nn.Linear(num_hidden, 1)
        
        # Weight initialization
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.xavier_uniform_(self.output.weight)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.output(x))
        return x

# Number of features in your dataset
num_features = features_train.shape[1]

# Instantiate the model
model = HybridRecommendationModel(num_features)

In [11]:
# Converting input tensors to float32
features_train = features_train.float()
features_test = features_test.float()
target_train = target_train.float()
target_test = target_test.float()

In [12]:
# Define a loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

batch_size = 32  # Adjust based on your dataset
# Gradient clipping value
clip_value = 1.0

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    loss = None  # Initialize loss to None or a default value
    for i in range(0, len(features_train), batch_size):
        batch_features = features_train[i:i + batch_size]
        batch_targets = target_train[i:i + batch_size]

        # Forward pass
        outputs = model(batch_features)
        outputs = torch.clamp(outputs, 0, 1)

        loss = criterion(outputs, batch_targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()

    # Check if loss is available before printing
    if loss is not None:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")
    else:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss calculation was skipped")

Epoch [1/100], Loss: 0.6832929849624634
Epoch [2/100], Loss: 0.6824767589569092
Epoch [3/100], Loss: 0.6811962127685547
Epoch [4/100], Loss: 0.6794062852859497
Epoch [5/100], Loss: 0.6770769953727722
Epoch [6/100], Loss: 0.6741318106651306
Epoch [7/100], Loss: 0.6705149412155151
Epoch [8/100], Loss: 0.6660303473472595
Epoch [9/100], Loss: 0.6605780720710754
Epoch [10/100], Loss: 0.6537641882896423
Epoch [11/100], Loss: 0.6452128887176514
Epoch [12/100], Loss: 0.6345345973968506
Epoch [13/100], Loss: 0.6213228106498718
Epoch [14/100], Loss: 0.6049414873123169
Epoch [15/100], Loss: 0.5851572751998901
Epoch [16/100], Loss: 0.5617567300796509
Epoch [17/100], Loss: 0.5344243049621582
Epoch [18/100], Loss: 0.503656804561615
Epoch [19/100], Loss: 0.4703712463378906
Epoch [20/100], Loss: 0.4345739483833313
Epoch [21/100], Loss: 0.3959755599498749
Epoch [22/100], Loss: 0.35644999146461487
Epoch [23/100], Loss: 0.3180052638053894
Epoch [24/100], Loss: 0.28094252943992615
Epoch [25/100], Loss: 0.

In [13]:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    test_outputs = model(features_test)
    test_loss = criterion(test_outputs, target_test)
print(f"Test Loss: {test_loss.item()}")

Test Loss: 1.9806822538375854
