3.User Engagement Analysis

Identify the most active users (profession) based on the number of ratings they’ve given.

Analyze the relationship between user demographic attributes (age, gender, occupation) and their movie preferences or rating patterns.

In [1]:
#Solution

# File paths
movies_file = r'C:\Users\Admin\Documents\ml-1m[1]\ml-1m\movies.dat'
ratings_file = r'C:\Users\Admin\Documents\ml-1m[1]\ml-1m\ratings.dat'
users_file = r'C:\Users\Admin\Documents\ml-1m[1]\ml-1m\users.dat'

# Step 1: Parse users.dat to create a mapping of user IDs to demographic attributes
user_demographics = {}  # {user_id: (gender, age, occupation)}

with open(users_file, 'r', encoding='latin-1') as users:
    for line in users:
        line = line.strip()
        user_id, gender, age, occupation, zip_code = line.split('::')
        user_demographics[int(user_id)] = (gender, int(age), int(occupation))

# Step 2: Parse movies.dat to create a mapping of movie IDs to genres
movie_genres = {}  # {movie_id: [genres]}

with open(movies_file, 'r', encoding='latin-1') as movies:
    for line in movies:
        line = line.strip()
        movie_id, title, genres = line.split('::')
        genres_list = genres.split('|')
        movie_genres[int(movie_id)] = genres_list

# Step 3: Parse ratings.dat to calculate engagement by demographics
occupation_ratings_count = {}  # {occupation: total_ratings_count}
gender_rating_distribution = {"M": 0, "F": 0}  # {gender: total_ratings_count}
age_rating_distribution = {}  # {age_group: total_ratings_count}
genre_preferences = {}  # {occupation: {genre: count}}

with open(ratings_file, 'r') as ratings:
    for line in ratings:
        line = line.strip()
        user_id, movie_id, rating, timestamp = map(int, line.split('::'))

        # User demographic information
        if user_id in user_demographics:
            gender, age, occupation = user_demographics[user_id]

            # Update occupation ratings count
            if occupation not in occupation_ratings_count:
                occupation_ratings_count[occupation] = 0
            occupation_ratings_count[occupation] += 1

            # Update gender ratings distribution
            gender_rating_distribution[gender] += 1

            # Update age group ratings count
            age_group = age // 10 * 10  # Group ages by decade
            if age_group not in age_rating_distribution:
                age_rating_distribution[age_group] = 0
            age_rating_distribution[age_group] += 1

            # Update genre preferences for this occupation
            if movie_id in movie_genres:
                if occupation not in genre_preferences:
                    genre_preferences[occupation] = {}
                for genre in movie_genres[movie_id]:
                    if genre not in genre_preferences[occupation]:
                        genre_preferences[occupation][genre] = 0
                    genre_preferences[occupation][genre] += 1

# Step 4: Identify the most active professions
most_active_professions = sorted(occupation_ratings_count.items(), key=lambda x: x[1], reverse=True)

# Step 5: Display results
print("Most Active Professions:")
for occupation, count in most_active_professions:
    print(f"Profession {occupation}: {count} ratings")

print("\nGender Rating Distribution:")
for gender, count in gender_rating_distribution.items():
    print(f"{gender}: {count} ratings")

print("\nAge Group Rating Distribution:")
for age_group, count in sorted(age_rating_distribution.items()):
    print(f"{age_group}s: {count} ratings")

print("\nGenre Preferences by Profession:")
for occupation, genres in genre_preferences.items():
    print(f"Profession {occupation}:")
    sorted_genres = sorted(genres.items(), key=lambda x: x[1], reverse=True)
    for genre, count in sorted_genres[:5]:  # Top 5 genres for each profession
        print(f"  {genre}: {count} ratings")

Most Active Professions:
Profession 4: 131032 ratings
Profession 0: 130499 ratings
Profession 7: 105425 ratings
Profession 1: 85351 ratings
Profession 17: 72816 ratings
Profession 20: 60397 ratings
Profession 12: 57214 ratings
Profession 2: 50068 ratings
Profession 14: 49109 ratings
Profession 16: 46021 ratings
Profession 6: 37205 ratings
Profession 3: 31623 ratings
Profession 10: 23290 ratings
Profession 15: 22951 ratings
Profession 5: 21850 ratings
Profession 11: 20563 ratings
Profession 19: 14904 ratings
Profession 13: 13754 ratings
Profession 18: 12086 ratings
Profession 9: 11345 ratings
Profession 8: 2706 ratings

Gender Rating Distribution:
M: 753769 ratings
F: 246440 ratings

Age Group Rating Distribution:
0s: 27211 ratings
10s: 183536 ratings
20s: 395556 ratings
30s: 199003 ratings
40s: 83633 ratings
50s: 111270 ratings

Genre Preferences by Profession:
Profession 10:
  Comedy: 9465 ratings
  Action: 6067 ratings
  Drama: 6000 ratings
  Thriller: 4212 ratings
  Sci-Fi: 3932 rat