Text Preprocessing

In [None]:
# Install necessary libraries
!pip install nltk pandas beautifulsoup4

# Import required libraries
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
file_name = 'games_apps.csv'  # Your dataset file
df = pd.read_csv(file_name)

# Initialize stopword list and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean HTML content
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

# Function to clean text
def clean_text(text):
    # 1. Remove HTML content
    text = clean_html(text)

    # 2. Remove punctuation, non-alphabet characters, and extra spaces
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Keep only alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    # 3. Convert text to lowercase
    text = text.lower()

    return text

# Function to preprocess text
def preprocess_text(text):
    # Clean the text
    text = clean_text(text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize tokens
    processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join the processed tokens back into a single string
    processed_text = ' '.join(processed_tokens)

    return processed_text

# Apply the preprocessing to the 'description' column
df['description'] = df['description'].apply(preprocess_text)

# Select only the 'description' and 'genreId' columns
output_df = df[['description', 'genreId']]

# Save the resulting dataframe to a new CSV file
output_file = 'games_apps_processed.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed dataset saved to {output_file}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
  return BeautifulSoup(text, "html.parser").get_text()


Processed dataset saved to games_apps_processed.csv


Generative Element

In [None]:
# Install necessary libraries in Google Colab
!pip install transformers pandas

# Import libraries
import pandas as pd
from transformers import pipeline

# Install necessary libraries in Google Colab
!pip install transformers pandas

# Import libraries
import pandas as pd
from transformers import pipeline

# Initialize a summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:


# Load the dataset
file_name = 'games_apps_processed.csv'
#df = pd.read_csv(file_name)
headers = pd.read_csv(file_name, nrows=0).columns

# Now, read rows 31 to 100
df = pd.read_csv(file_name, skiprows=1500, nrows=25, header=None, names=headers)

# Function to summarize text in batches
def summarize_texts_in_batches(texts, batch_size=8):
    summaries = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            # Generate summaries for the batch
            summary_outputs = summarizer(batch, max_length=100, min_length=30, do_sample=False)
            summaries.extend([output['summary_text'] for output in summary_outputs])
        except Exception as e:
            print(f"Error during summarization: {e}")
            summaries.extend(batch)  # Return original texts in case of an error
    return summaries

# Apply the summarization function to all descriptions in batches
df['summary'] = summarize_texts_in_batches(df['description'].tolist(), batch_size=8)

# Select the required columns
output_df = df[['description', 'summary', 'genreId']]

# Save the updated dataframe back to a new CSV file
output_file = 'games_apps_summarized.csv'
output_df.to_csv(output_file, index=False)

print(f"Summarized dataset saved to {output_file}")


Your max_length is set to 100, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 100, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 100, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 100, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your m

Summarized dataset saved to games_apps_summarized.csv


Merging both texts

In [26]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('games_apps_summarized.csv')
df['merged_description_summary'] = df['description'] + " " + df['summary']
df = df.drop(['description', 'summary'], axis=1)
df.to_csv('updated_games_apps.csv', index=False)

Embdeing generation, clustering and evaluations

In [3]:
!pip install -q sentence-transformers
!pip install -q scikit-learn


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/245.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m235.5/245.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

# Load the Sentence Transformer model for generating embeddings
model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')

# Extract the relevant dataframe
#df = genre_dfs["GAME_CASUAL"]



  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

# Initialize a single dataframe to store results for all cluster sizes
df_results = pd.DataFrame(columns=["n_clusters",
                                   "Silhouette Score (Agglomerative)", "Silhouette Score (KMeans)",
                                   "Davies-Bouldin Score (Agglomerative)", "Davies-Bouldin Score (KMeans)",
                                   "Adjusted Rand Index"])

# Load the dataframe
df = pd.read_csv('updated_games_apps.csv')

# Generate embeddings for the 'merged_description_summary' column
embeddings = model.encode(df['merged_description_summary'].tolist(), batch_size=32, show_progress_bar=True)

# Define different cluster sizes
n_clusters_list = [22, 150, len(df) // 10]  # n/10 is the integer division of the number of rows by 10

# Iterate over each cluster size
for n_clusters in n_clusters_list:
    print(f"\nClustering with n_clusters = {n_clusters}")

    # Perform Agglomerative Clustering (Ward linkage)
    agg_cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    agg_labels = agg_cluster.fit_predict(embeddings)

    # Perform KMeans Clustering
    kmeans_cluster = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_labels = kmeans_cluster.fit_predict(embeddings)

    # Add cluster labels to the dataframe
    df[f'agg_cluster_labels_{n_clusters}'] = agg_labels
    df[f'kmeans_cluster_labels_{n_clusters}'] = kmeans_labels

    # Calculate Silhouette Scores for both clustering methods
    silhouette_agg = silhouette_score(embeddings, agg_labels)
    silhouette_kmeans = silhouette_score(embeddings, kmeans_labels)

    # Calculate Davies-Bouldin Scores for both clustering methods
    davies_bouldin_agg = davies_bouldin_score(embeddings, agg_labels)
    davies_bouldin_kmeans = davies_bouldin_score(embeddings, kmeans_labels)

    # Calculate Adjusted Rand Index to compare the clustering results
    ari_score = adjusted_rand_score(agg_labels, kmeans_labels)

    # Prepare a dictionary with the evaluation metrics
    result_row = {
        "n_clusters": n_clusters,
        "Silhouette Score (Agglomerative)": silhouette_agg,
        "Silhouette Score (KMeans)": silhouette_kmeans,
        "Davies-Bouldin Score (Agglomerative)": davies_bouldin_agg,
        "Davies-Bouldin Score (KMeans)": davies_bouldin_kmeans,
        "Adjusted Rand Index": ari_score
    }

    # Convert result_row to a DataFrame and append to the results dataframe
    df_results = pd.concat([df_results, pd.DataFrame([result_row])], ignore_index=True)

# After the loop, print the results and save them to a CSV file
print("Clustering Results:")
print(df_results)

# Save the results to a CSV file
df_results.to_csv('clustering_results.csv', index=False)


Batches:   0%|          | 0/48 [00:00<?, ?it/s]


Clustering with n_clusters = 22


  df_results = pd.concat([df_results, pd.DataFrame([result_row])], ignore_index=True)



Clustering with n_clusters = 150

Clustering with n_clusters = 152
Clustering Results:
  n_clusters  Silhouette Score (Agglomerative)  Silhouette Score (KMeans)  \
0         22                          0.050387                   0.059967   
1        150                          0.068345                   0.051251   
2        152                          0.068916                   0.052046   

   Davies-Bouldin Score (Agglomerative)  Davies-Bouldin Score (KMeans)  \
0                              3.109126                       2.966231   
1                              2.054704                       2.170459   
2                              2.041975                       2.162148   

   Adjusted Rand Index  
0             0.336111  
1             0.360551  
2             0.361122  
