In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity  # Still useful for small datasets or testing
import pickle
 # FAISS for efficient similarity search
get_ipython().system('pip install scikit-learn')
get_ipython().system('pip install faiss-cpu') 


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Load the dataset
content_data = pd.read_csv('SocialMediaUsersDataset.csv')

In [3]:
print(content_data)

       UserID                Name  Gender         DOB  \
0           1       Jesse Lawhorn  Female  1958-10-15   
1           2         Stacy Payne  Female  2004-07-21   
2           3  Katrina Nicewander  Female  2000-02-07   
3           4      Eric Yarbrough    Male  1985-04-14   
4           5       Daniel Adkins  Female  1955-09-18   
...       ...                 ...     ...         ...   
99995   99996      Lionel Denault  Female  1983-07-31   
99996   99997     Margie Mieszala  Female  1954-05-20   
99997   99998       Joan Mercedes    Male  1975-06-06   
99998   99999        Marvin Massa  Female  1959-11-16   
99999  100000          Josh Young  Female  1988-07-29   

                                               Interests  \
0                'Movies', 'Fashion', 'Fashion', 'Books'   
1      'Gaming', 'Finance and investments', 'Outdoor ...   
2        'DIY and crafts', 'Music', 'Science', 'Fashion'   
3           'Outdoor activities', 'Cars and automobiles'   
4              

In [4]:
# Check the actual column names to ensure they match
print(content_data.columns)

Index(['UserID', 'Name', 'Gender', 'DOB', 'Interests', 'City', 'Country'], dtype='object')


In [5]:
# Select relevant columns: 'Gender', 'DOB', 'Interests', 'City', 'Country'
content_data = content_data[['Gender', 'DOB', 'Interests', 'City', 'Country']]

In [6]:
# Remove any rows with missing interests
content_data.dropna(subset=['Interests'], inplace=True)


In [7]:
# Assuming you have already loaded and preprocessed the content_data DataFrame

# Preprocess the 'Interests' column
def preprocess_interests(interests):
    # Remove single quotes and split by comma
    return interests.replace("'", "").split(', ') if isinstance(interests, str) else []

# Apply preprocessing to the 'Interests' column
content_data['Interests'] = content_data['Interests'].apply(preprocess_interests)

# Create a 'tags' column by joining interests into a single string
content_data['tags'] = content_data['Interests'].apply(lambda x: ' '.join(x))

# Convert the tags to lowercase for uniformity
content_data['tags'] = content_data['tags'].apply(lambda x: x.lower())

# Create a new DataFrame with relevant columns for recommendation
new_df = content_data[['Gender', 'DOB', 'City', 'Country', 'tags']]

# Preview the new DataFrame
print(new_df.head())


   Gender         DOB         City    Country  \
0  Female  1958-10-15      Sibolga  Indonesia   
1  Female  2004-07-21     Al Abyār      Libya   
2  Female  2000-02-07  Wādī as Sīr     Jordan   
3    Male  1985-04-14       Matera      Italy   
4  Female  1955-09-18      Biruaca  Venezuela   

                                                tags  
0                       movies fashion fashion books  
1  gaming finance and investments outdoor activit...  
2               diy and crafts music science fashion  
3            outdoor activities cars and automobiles  
4                                   politics history  


In [8]:
# Assuming 'content_data' is your DataFrame for the social media users
# View the shape of the social media dataset
print("Shape of the social media dataset:", content_data.shape)

# View the first few rows of the social media dataset
print(content_data.head())


Shape of the social media dataset: (100000, 6)
   Gender         DOB                                          Interests  \
0  Female  1958-10-15                  [Movies, Fashion, Fashion, Books]   
1  Female  2004-07-21  [Gaming, Finance and investments, Outdoor acti...   
2  Female  2000-02-07          [DIY and crafts, Music, Science, Fashion]   
3    Male  1985-04-14         [Outdoor activities, Cars and automobiles]   
4  Female  1955-09-18                                [Politics, History]   

          City    Country                                               tags  
0      Sibolga  Indonesia                       movies fashion fashion books  
1     Al Abyār      Libya  gaming finance and investments outdoor activit...  
2  Wādī as Sīr     Jordan               diy and crafts music science fashion  
3       Matera      Italy            outdoor activities cars and automobiles  
4      Biruaca  Venezuela                                   politics history  


In [9]:
# Value counts for the 'Gender' column (or any relevant column you want to analyze)
print(content_data['Gender'].value_counts())

# Get information about the dataset
content_data.info()

# Select specific columns from the content_data dataset
print(content_data[['Gender', 'DOB', 'City', 'Country', 'Interests']].head())

# Check for missing values
print(content_data.isnull().sum())


Gender
Male      50069
Female    49931
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Gender     100000 non-null  object
 1   DOB        100000 non-null  object
 2   Interests  100000 non-null  object
 3   City       100000 non-null  object
 4   Country    100000 non-null  object
 5   tags       100000 non-null  object
dtypes: object(6)
memory usage: 4.6+ MB
   Gender         DOB         City    Country  \
0  Female  1958-10-15      Sibolga  Indonesia   
1  Female  2004-07-21     Al Abyār      Libya   
2  Female  2000-02-07  Wādī as Sīr     Jordan   
3    Male  1985-04-14       Matera      Italy   
4  Female  1955-09-18      Biruaca  Venezuela   

                                           Interests  
0                  [Movies, Fashion, Fashion, Books]  
1  [Gaming, Finance and investments, Outdoor acti...  
2     

In [10]:
# Drop rows with missing values
content_data.dropna(inplace=True)

In [11]:
# View the 'Interests' column of the first row
print(content_data.iloc[0]['Interests'])



['Movies', 'Fashion', 'Fashion', 'Books']


In [12]:
import ast  # Make sure to import ast at the beginning of your script

# Define the function to convert the 'Interests' or other relevant columns
def convert(obj):
    L = []
    # Safely evaluate the string representation of the list
    try:
        # Convert the string to a list
        parsed_list = ast.literal_eval(obj)  # Safely parses the string to list/dict
        # Check if parsed_list is indeed a list
        if isinstance(parsed_list, list):
            for item in parsed_list:  # Iterate through the list
                L.append(item.strip())  # Strip any leading/trailing spaces and append
    except (ValueError, SyntaxError):
        return []  # Return an empty list if there is any issue
    return L


In [13]:
# Apply the convert function to the 'Interests' column
content_data['Interests'] = content_data['Interests'].apply(convert)

# If there's another relevant column to convert (e.g., 'keywords'), do the same
# content_data['keywords'] = content_data['keywords'].apply(convert)

# Preview the updated DataFrame
print(content_data[['Interests']].head())


  Interests
0        []
1        []
2        []
3        []
4        []


In [14]:
# View the transformed 'Interests' column (or other relevant columns)
print(content_data[['Gender', 'DOB', 'City', 'Country', 'Interests']].head())


   Gender         DOB         City    Country Interests
0  Female  1958-10-15      Sibolga  Indonesia        []
1  Female  2004-07-21     Al Abyār      Libya        []
2  Female  2000-02-07  Wādī as Sīr     Jordan        []
3    Male  1985-04-14       Matera      Italy        []
4  Female  1955-09-18      Biruaca  Venezuela        []


In [15]:
# Remove spaces in 'Interests'
content_data['Interests'] = content_data['Interests'].apply(lambda x: [i.replace(" ", "") for i in x])

# View the transformed 'Interests' column
print(content_data[['Gender', 'DOB', 'City', 'Country', 'Interests']].head())


   Gender         DOB         City    Country Interests
0  Female  1958-10-15      Sibolga  Indonesia        []
1  Female  2004-07-21     Al Abyār      Libya        []
2  Female  2000-02-07  Wādī as Sīr     Jordan        []
3    Male  1985-04-14       Matera      Italy        []
4  Female  1955-09-18      Biruaca  Venezuela        []


In [16]:
# View the transformed 'Interests' column
print(content_data[['Gender', 'DOB', 'City', 'Country', 'Interests']].head())


   Gender         DOB         City    Country Interests
0  Female  1958-10-15      Sibolga  Indonesia        []
1  Female  2004-07-21     Al Abyār      Libya        []
2  Female  2000-02-07  Wādī as Sīr     Jordan        []
3    Male  1985-04-14       Matera      Italy        []
4  Female  1955-09-18      Biruaca  Venezuela        []


In [17]:

# Create the 'tags' column by combining 'Interests', 'City', and 'Country'
# Ensure all parts are concatenated properly
content_data['tags'] = content_data['Interests'] + content_data['City'].apply(lambda x: [x]) + content_data['Country'].apply(lambda x: [x])

# View the first few rows of the dataset with the new 'tags' column
print(content_data[['Gender', 'DOB', 'City', 'Country', 'tags']].head())


   Gender         DOB         City    Country                   tags
0  Female  1958-10-15      Sibolga  Indonesia   [Sibolga, Indonesia]
1  Female  2004-07-21     Al Abyār      Libya      [Al Abyār, Libya]
2  Female  2000-02-07  Wādī as Sīr     Jordan  [Wādī as Sīr, Jordan]
3    Male  1985-04-14       Matera      Italy        [Matera, Italy]
4  Female  1955-09-18      Biruaca  Venezuela   [Biruaca, Venezuela]


In [18]:
# Optional: Join tags into a single string per user (similar to how it was done for movies)
content_data['tags'] = content_data['tags'].apply(lambda x: ' '.join(x))  # Join the list into a single string

# View the first few rows of the dataset with the new 'tags' column
print(content_data[['Gender', 'DOB', 'City', 'Country', 'tags']].head())


   Gender         DOB         City    Country                tags
0  Female  1958-10-15      Sibolga  Indonesia   Sibolga Indonesia
1  Female  2004-07-21     Al Abyār      Libya      Al Abyār Libya
2  Female  2000-02-07  Wādī as Sīr     Jordan  Wādī as Sīr Jordan
3    Male  1985-04-14       Matera      Italy        Matera Italy
4  Female  1955-09-18      Biruaca  Venezuela   Biruaca Venezuela


In [19]:
# View the first few rows of the dataset with the new 'tags' column
print(content_data[['Interests', 'tags','Country']].head())


  Interests                tags    Country
0        []   Sibolga Indonesia  Indonesia
1        []      Al Abyār Libya      Libya
2        []  Wādī as Sīr Jordan     Jordan
3        []        Matera Italy      Italy
4        []   Biruaca Venezuela  Venezuela


In [20]:
new_df = content_data[['Gender', 'Interests', 'tags','Country']]

In [21]:
print(content_data.columns)


Index(['Gender', 'DOB', 'Interests', 'City', 'Country', 'tags'], dtype='object')


In [22]:
print(new_df.head())

   Gender Interests                tags    Country
0  Female        []   Sibolga Indonesia  Indonesia
1  Female        []      Al Abyār Libya      Libya
2  Female        []  Wādī as Sīr Jordan     Jordan
3    Male        []        Matera Italy      Italy
4  Female        []   Biruaca Venezuela  Venezuela


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer with max 5000 features and stop words in English
cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit and transform the 'tags' column of new_df to convert text data to numerical data
vectorized_data = cv.fit_transform(new_df['tags']).toarray()

# View the shape of the resulting matrix
print(vectorized_data.shape)

# Optional: View the first few rows of the vectorized data
print(vectorized_data[5:])


(100000, 5000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [24]:
# Fit and transform the 'tags' column of new_df to convert text data into numerical format (vectorized)
vectorized_data = cv.fit_transform(new_df['tags']).toarray()

# Check the shape of the vectorized data (rows = number of items, columns = features)
print(vectorized_data.shape)

# Optional: View the first few rows of the vectorized data for inspection
print(vectorized_data)


(100000, 5000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
print(vectorized_data[0])
print(vectorized_data[1])
print(vectorized_data[2])lgorithm='brute')


[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [26]:
print(cv.get_feature_names_out())


['02' '03' '05' ... 'žďár' 'ḩalḩūl' 'ḩawātah']


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
from sklearn.neighbors import NearestNeighbors

# Initialize the NearestNeighbors model with cosine similarity
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model on the vectorized data
nn_model.fit(vectorized_data)

# Find the top 5 nearest neighbors (or any number you want) for a specific data point (e.g., first user)
distances, indices = nn_model.kneighbors([vectorized_data[0]], n_neighbors=5)

# Print the indices of the similar users or content and their distances
print("Indices of similar items:", indices)
print("Distances from the target item:", distances)


Indices of similar items: [[61883 98725 70758 53374 26772]]
Distances from the target item: [[0. 0. 0. 0. 0.]]


In [29]:
import faiss
# Assume the dataset is already loaded and vectorized
# Create vectorized data (assuming 'tags' column is processed already)
cv = CountVectorizer(max_features=5000, stop_words='english')
vectorized_data = cv.fit_transform(new_df['tags']).toarray()

# Normalize the vectorized data to have unit norm (L2 norm)
vectorized_data = vectorized_data.astype('float32')
faiss.normalize_L2(vectorized_data)

# Initialize a FAISS index for inner product search
dimension = vectorized_data.shape[1]  # Number of features (5000 in this case)
index = faiss.IndexFlatIP(dimension)  # Inner product search

# Add vectorized data to the FAISS index
index.add(vectorized_data)

# Query the FAISS index to find the 5 most similar items (including itself)
k = 6  # 1 for the item itself + 5 similar items
query_vector = vectorized_data[0].reshape(1, -1)  # Example: querying for the first item
distances, indices = index.search(query_vector, k)  # Perform the search

# Print the indices of the 5 most similar items (excluding the item itself)
print("Top 5 similar items:", indices[0][1:])  # Exclude the first result (itself)


Top 5 similar items: [214 197  91  90   0]


In [30]:
# Load the dataset
content_data = pd.read_csv('SocialMediaUsersDataset.csv')

# Check the structure of the Interests column
print("Initial Interests Data:")
print(content_data['Interests'].head())

# Define a function to convert string representation of lists into actual lists
def convert(obj):
    L = []
    if isinstance(obj, str):  # Ensure obj is a string
        try:
            # Safely evaluate the string representation of the list of dictionaries
            items = ast.literal_eval(obj)
            if isinstance(items, list):  # Check if items is a list
                for i in items:
                    if isinstance(i, dict) and 'name' in i:  # Ensure each item is a dict with 'name'
                        L.append(i['name'])
        except (ValueError, SyntaxError):
            return []  # Return an empty list if there is any issue
    return L

# Apply the convert function to the relevant columns
content_data['Interests'] = content_data['Interests'].apply(convert)

# Create a new tags column by combining relevant information
content_data['tags'] = content_data['Interests'].apply(lambda x: ' '.join(x))

# Print the processed data
print("Processed Interests Data:")
print(content_data['Interests'].head())
print("Tags Data:")
print(content_data['tags'].head())

# Initialize the CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
#vectorized_data = cv.fit_transform(content_data['tags']).toarray()

# Create a Faiss index
dimension = vectorized_data.shape[1]
index = faiss.IndexFlatIP(dimension)  # Use Inner Product
index.add(np.array(vectorized_data).astype('float32'))  # Add the vectors to the index

# Recommendation function
def recommend(item_index, num_recommendations=5):
    # Get the vector of the item we want to find recommendations for
    item_vector = vectorized_data[item_index].reshape(1, -1).astype('float32')

    # Search for the nearest neighbors in the index
    distances, indices = index.search(item_vector, num_recommendations + 1)  # +1 to exclude the item itself

    # Print distances and indices for debugging
    print("Distances:", distances)
    print("Indices:", indices)

    # Get the recommended item indices (excluding the first one which is the item itself)
    recommended_indices = indices[0][1:]

    # Get the names (or titles) of the recommended items
    recommended_items = content_data.iloc[recommended_indices]['Interests'].values

    return recommended_items

# Example usage
item_index = 0  # Change this to the desired item's index for recommendations
recommended_items = recommend(item_index=item_index, num_recommendations=5)  # Change num_recommendations as needed
print("Recommended Items:", recommended_items)


Initial Interests Data:
0              'Movies', 'Fashion', 'Fashion', 'Books'
1    'Gaming', 'Finance and investments', 'Outdoor ...
2      'DIY and crafts', 'Music', 'Science', 'Fashion'
3         'Outdoor activities', 'Cars and automobiles'
4                                'Politics', 'History'
Name: Interests, dtype: object
Processed Interests Data:
0    []
1    []
2    []
3    []
4    []
Name: Interests, dtype: object
Tags Data:
0    
1    
2    
3    
4    
Name: tags, dtype: object
Distances: [[1. 1. 1. 1. 1. 1.]]
Indices: [[279 214 197  91  90   0]]
Recommended Items: [list([]) list([]) list([]) list([]) list([])]


In [31]:


# Load the dataset
content_data = pd.read_csv('SocialMediaUsersDataset.csv')

# Assuming you've processed content_data as per your earlier code
# Here is a quick outline of how it might look
# Define a function to convert string representation of lists into actual lists
def convert(obj):
    L = []
    if isinstance(obj, str):  # Ensure obj is a string
        try:
            items = ast.literal_eval(obj)
            if isinstance(items, list):  # Check if items is a list
                for i in items:
                    if isinstance(i, dict) and 'name' in i:
                        L.append(i['name'])
        except (ValueError, SyntaxError):
            return []  # Return an empty list if there is any issue
    return L

# Apply the convert function to the relevant columns
content_data['Interests'] = content_data['Interests'].apply(convert)

# Create a new tags column by combining relevant information
content_data['tags'] = content_data['Interests'].apply(lambda x: ' '.join(x))

# Save the processed DataFrame `content_data` as a pickle file
with open('social_media_recommendations.pkl', 'wb') as f:
    pickle.dump(content_data, f)

print("DataFrame saved as 'social_media_recommendations.pkl'")


DataFrame saved as 'social_media_recommendations.pkl'


In [32]:
# Assuming content_data is already defined and has an 'Interests' column
interests = content_data['Interests'].values
print("Interests:")
print(interests)

# To check the first few interests
print("First 5 Interests:")
print(interests[:5])  # Prints the first 5 interests


Interests:
[list([]) list([]) list([]) ... list([]) list([]) list([])]
First 5 Interests:
[list([]) list([]) list([]) list([]) list([])]


In [33]:


# Save the DataFrame `content_data` as a dictionary in a pickle file
pickle.dump(content_data.to_dict(), open('social_media_dict.pkl', 'wb'))


In [35]:
# Recommendation function
def recommend(item_index, num_recommendations=5):
    # Get the vector of the item we want to find recommendations for
    item_vector = vectorized_data[item_index].reshape(1, -1).astype('float32')

    # Search for the nearest neighbors in the index
    distances, indices = index.search(item_vector, num_recommendations + 1)  # +1 to exclude the item itself

    # Get the recommended item indices (excluding the first one which is the item itself)
    recommended_indices = indices[0][1:]

    # Get the names (or titles) of the recommended items
    recommended_items = content_data.iloc[recommended_indices]['Interests'].values  # Adjust if necessary

    return recommended_items

# Example usage
recommended_items = recommend(item_index=0, num_recommendations=5)  # Change item_index to the desired item's index
print("Recommended Items:", recommended_items)

# Save the content data DataFrame
pickle.dump(content_data.to_dict(), open('content_data_dict.pkl', 'wb'))

# Save the vectorized data
pickle.dump(vectorized_data, open('vectorized_data.pkl', 'wb'))

# Save the Faiss index
faiss.write_index(index, 'faiss_index.index')


Recommended Items: [list([]) list([]) list([]) list([]) list([])]
