In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from node2vec import Node2Vec
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw Play Store export and normalise missing values.
file_path = "playstore-analysis.csv"

# Missing ratings / review counts are filled with 0 *before* the final dropna
# so that those rows are kept. The fill is assigned back to the frame rather
# than done via `df['col'].fillna(..., inplace=True)`: that chained in-place
# form operates on a temporary under pandas Copy-on-Write and would leave the
# NaNs in place, causing dropna() to discard every unrated app.
df = (
    pd.read_csv(file_path)
    .drop_duplicates()
    .fillna({"Rating": 0, "Reviews": 0})
    .dropna()  # any row still missing a value in another column is removed
)

In [3]:
# Remove the characters "+", "," and "$" from Installs / Price so the values
# can be parsed as numbers. The pattern is a raw string: in a normal string
# literal "\$" is an invalid escape sequence.
for col in ['Installs', 'Price']:
    df[col] = df[col].astype(str).str.replace(r"[+,\$]", "", regex=True)

# errors="coerce" turns any value that is still not numeric into NaN instead
# of raising.
# NOTE(review): this runs after the dropna() in the loading cell, so coerced
# NaNs presumably stay in the frame — confirm that is intended.
for col in ["Installs", "Reviews", "Price"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [4]:
# Draw a fixed-seed sample of 500 rows and renumber it 0..499; the graph
# cells below address rows by that integer position. The sample is also
# written to disk.
df_sample = df.sample(n=500, random_state=42).reset_index(drop=True)
df_sample.to_csv("sampled_data.csv", index=False)
print("Sampled data saved successfully!")

Sampled data saved successfully!


In [5]:
# One graph/model is built per entry: the key is used as the output file stem,
# the value is the list of columns compared between two apps. Every variant
# pairs "Category" with exactly one other column.
similarity_measures = {
    f"category_{extra.lower()}": ["Category", extra]
    for extra in ("Installs", "Rating", "Size", "Reviews")
}


In [6]:
# For every similarity measure: build an app-similarity graph, train Node2Vec
# embeddings on it, and persist both the model (<name>.pkl) and the graph
# (<name>.graphml).
for model_name, features in similarity_measures.items():
    G = nx.Graph()

    # One node per sampled app, keyed by its row label in df_sample.
    for idx, row in df_sample.iterrows():
        G.add_node(idx, name=row["App"], category=row["Category"], **{feat: row[feat] for feat in features})

    # Extract the compared columns once instead of doing four scalar
    # df_sample.loc lookups per pair. Row position i is used as the node id,
    # which matches the labels above because df_sample's index was reset to
    # 0..n-1 when the sample was drawn.
    feature_rows = df_sample[features].to_numpy(dtype=object)
    n_apps = len(feature_rows)
    for i in range(n_apps):
        for j in range(i + 1, n_apps):
            # Fraction of compared columns on which the two apps agree exactly.
            matches = sum(1 for a, b in zip(feature_rows[i], feature_rows[j]) if a == b)
            similarity_score = matches / len(features)

            # With two columns the score is 0, 0.5 or 1, so this keeps every
            # pair that agrees on at least one column.
            if similarity_score > 0.2:
                G.add_edge(i, j, weight=similarity_score)

    print(f"Graph for {model_name} created with {len(G.nodes())} nodes and {len(G.edges())} edges.")

    # NOTE(review): no seed is passed to Node2Vec, so the embeddings presumably
    # differ between runs — confirm whether reproducibility matters here.
    node2vec = Node2Vec(G, dimensions=32, walk_length=10, num_walks=200, workers=4)
    model = node2vec.fit(window=5, min_count=1, batch_words=4)

    # Context manager so the file handle is flushed and closed deterministically
    # (the bare open(...) passed to pickle.dump was never closed).
    with open(f"{model_name}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
    nx.write_graphml(G, f"{model_name}.graphml")

    print(f"Model and graph for {model_name} saved.")

Graph for category_installs created with 500 nodes and 19365 edges.


Computing transition probabilities:   0%|          | 0/500 [00:00<?, ?it/s]

Model and graph for category_installs saved.
Graph for category_rating created with 500 nodes and 17344 edges.


Computing transition probabilities:   0%|          | 0/500 [00:00<?, ?it/s]

Model and graph for category_rating saved.
Graph for category_size created with 500 nodes and 12198 edges.


Computing transition probabilities:   0%|          | 0/500 [00:00<?, ?it/s]

Model and graph for category_size saved.
Graph for category_reviews created with 500 nodes and 9865 edges.


Computing transition probabilities:   0%|          | 0/500 [00:00<?, ?it/s]

Model and graph for category_reviews saved.


In [None]:
# Combined model: same pipeline as the per-measure graphs, but two apps are
# compared on all five columns at once.
combined_features = ["Category","Installs" ,"Rating", "Size", "Reviews"]


G_combined = nx.Graph()
# One node per sampled app, keyed by its row label in df_sample.
for idx, row in df_sample.iterrows():
    G_combined.add_node(idx, name=row["App"], category=row["Category"], 
                        **{feat: row[feat] for feat in combined_features})

# Extract the compared columns once instead of doing scalar df_sample.loc
# lookups for every pair. Row position i is used as the node id, which matches
# the labels above because df_sample's index was reset to 0..n-1 when the
# sample was drawn.
combined_rows = df_sample[combined_features].to_numpy(dtype=object)
n_apps = len(combined_rows)
for i in range(n_apps):
    for j in range(i + 1, n_apps):
        # Fraction of the five columns on which the two apps agree exactly.
        matches = sum(1 for a, b in zip(combined_rows[i], combined_rows[j]) if a == b)
        similarity_score = matches / len(combined_features)

        # Scores are multiples of 0.2 here, so the strict comparison keeps only
        # pairs that agree on at least two columns.
        if similarity_score > 0.2:  
            G_combined.add_edge(i, j, weight=similarity_score)

print(f"Graph for Combined Model created with {len(G_combined.nodes())} nodes and {len(G_combined.edges())} edges.")

# NOTE(review): no seed is passed to Node2Vec, so the embeddings presumably
# differ between runs — confirm whether reproducibility matters here.
node2vec_combined = Node2Vec(G_combined, dimensions=32, walk_length=10, num_walks=200, workers=4)
model_combined = node2vec_combined.fit(window=5, min_count=1, batch_words=4)

# Context manager so the file handle is flushed and closed deterministically
# (the bare open(...) passed to pickle.dump was never closed).
with open("Combined.pkl", "wb") as model_file:
    pickle.dump(model_combined, model_file)
nx.write_graphml(G_combined, "Combined.graphml")

print("Combined model and graph saved successfully!")

Graph for Combined Model created with 500 nodes and 3597 edges.


Computing transition probabilities:   0%|          | 0/500 [00:00<?, ?it/s]

Combined model and graph saved successfully!
