4. Rating Distribution by Demographics

Investigate how ratings vary by user demographic attributes (age, gender, occupation).

Are there specific genres preferred by certain age groups or occupations?

In [1]:
#Solution

# File paths
# All three ml-1m input files sit in the same folder, so name it once.
_ml1m_dir = r'C:\Users\Admin\Documents\ml-1m[1]\ml-1m'
movies_file = _ml1m_dir + r'\movies.dat'
ratings_file = _ml1m_dir + r'\ratings.dat'
users_file = _ml1m_dir + r'\users.dat'

# Step 1: Parse users.dat to create a mapping of user IDs to demographics
# Each record is split on '::' into user_id, gender, age, occupation, zip_code.
user_demographics = {}  # {user_id: (gender, age_group, occupation)}

with open(users_file, 'r', encoding='latin-1') as users:
    for line in users:
        line = line.strip()
        if not line:
            # A blank line (e.g. a stray newline at the end of the file) has
            # no fields to unpack; skip it instead of raising ValueError.
            continue
        user_id, gender, age, occupation, zip_code = line.split('::')
        # Group ages by decade: floor the value to a multiple of 10
        # (e.g. 25 -> 20, 56 -> 50).
        # NOTE(review): if the age column holds range codes rather than real
        # ages (the MovieLens 1M README describes codes such as 1, 18, 25, 56),
        # this bucketing merges distinct codes and the decade label printed
        # later is only approximate -- confirm against the dataset README.
        age_group = int(age) // 10 * 10
        user_demographics[int(user_id)] = (gender, age_group, int(occupation))

# Step 2: Parse movies.dat to create a mapping of movie IDs to genres
# Each record is split on '::' into movie_id, title, genres, where genres is
# a '|'-separated list.
movie_genres = {}  # {movie_id: [genres]}

with open(movies_file, 'r', encoding='latin-1') as movies:
    for line in movies:
        line = line.strip()
        if not line:
            # A blank line (e.g. a stray newline at the end of the file) has
            # no fields to unpack; skip it instead of raising ValueError.
            continue
        movie_id, title, genres = line.split('::')
        movie_genres[int(movie_id)] = genres.split('|')

# Step 3: Parse ratings.dat and calculate demographic-based rating distributions
# Each record is split on '::' into four integers:
# user_id, movie_id, rating, timestamp.
demographic_rating_distribution = {}  # {(gender, age_group, occupation): {rating_level: count}}
demographic_genre_preferences = {}  # {(gender, age_group, occupation): {genre: count}}

# The encoding is given explicitly so this reader matches the users/movies
# readers instead of depending on the platform's default encoding.
with open(ratings_file, 'r', encoding='latin-1') as ratings:
    for line in ratings:
        line = line.strip()
        if not line:
            # Blank lines would make int('') raise ValueError; skip them.
            continue
        user_id, movie_id, rating, timestamp = map(int, line.split('::'))

        # Ratings from users missing in users.dat cannot be attributed to
        # any demographic group, so they are ignored.
        if user_id not in user_demographics:
            continue

        # Determine rating level: 5 is High, 3-4 is Medium, anything else Low
        if rating == 5:
            rating_level = 'High'
        elif rating in (3, 4):
            rating_level = 'Medium'
        else:
            rating_level = 'Low'

        # The stored (gender, age_group, occupation) tuple is the group key.
        key = user_demographics[user_id]
        gender, age_group, occupation = key

        # Count this rating in the group's High/Medium/Low distribution
        if key not in demographic_rating_distribution:
            demographic_rating_distribution[key] = {'High': 0, 'Medium': 0, 'Low': 0}
        demographic_rating_distribution[key][rating_level] += 1

        # Genre preferences are only tracked for movies listed in movies.dat
        if movie_id not in movie_genres:
            continue
        if key not in demographic_genre_preferences:
            demographic_genre_preferences[key] = {}
        genre_counts = demographic_genre_preferences[key]
        # A movie with several genres contributes one count to each of them.
        for genre in movie_genres[movie_id]:
            genre_counts[genre] = genre_counts.get(genre, 0) + 1

# Step 4: Output results

# For every demographic group, show what share of its ratings fell into
# each rating level.
print("Rating Distribution by Demographics:")
for (gender, age_group, occupation), level_counts in demographic_rating_distribution.items():
    print(f"Gender: {gender}, Age Group: {age_group}s, Occupation: {occupation}")
    total_ratings = sum(level_counts.values())
    for level in level_counts:
        percentage = level_counts[level] / total_ratings * 100
        print(f"  {level}: {percentage:.2f}%")
    print()

# For every demographic group, list the five genres it rated most often.
print("Genre Preferences by Demographics:")
for (gender, age_group, occupation), genre_counts in demographic_genre_preferences.items():
    print(f"Gender: {gender}, Age Group: {age_group}s, Occupation: {occupation}")
    ranked = sorted(genre_counts.items(), key=lambda pair: pair[1], reverse=True)
    for genre, count in ranked[:5]:  # Top 5 genres
        print(f"  {genre}: {count} ratings")
    print()

Rating Distribution by Demographics:
Gender: F, Age Group: 0s, Occupation: 10
  High: 25.55%
  Medium: 57.02%
  Low: 17.43%

Gender: M, Age Group: 50s, Occupation: 16
  High: 22.07%
  Medium: 63.47%
  Low: 14.46%

Gender: M, Age Group: 20s, Occupation: 15
  High: 21.98%
  Medium: 60.66%
  Low: 17.35%

Gender: M, Age Group: 40s, Occupation: 7
  High: 24.44%
  Medium: 63.00%
  Low: 12.55%

Gender: M, Age Group: 20s, Occupation: 20
  High: 21.30%
  Medium: 57.54%
  Low: 21.16%

Gender: F, Age Group: 50s, Occupation: 9
  High: 29.47%
  Medium: 61.12%
  Low: 9.41%

Gender: M, Age Group: 30s, Occupation: 1
  High: 20.16%
  Medium: 60.78%
  Low: 19.05%

Gender: M, Age Group: 20s, Occupation: 12
  High: 22.64%
  Medium: 61.86%
  Low: 15.49%

Gender: M, Age Group: 20s, Occupation: 17
  High: 21.96%
  Medium: 63.02%
  Low: 15.03%

Gender: F, Age Group: 30s, Occupation: 1
  High: 22.54%
  Medium: 62.96%
  Low: 14.50%

Gender: F, Age Group: 20s, Occupation: 1
  High: 20.11%
  Medium: 63.00%
  Low: