6. Exploring Long Tail

Investigate the "long tail" of the dataset: How many movies receive very few ratings?

What are the characteristics of these less-rated movies compared to popular ones?

In [1]:
#Solution

# File paths
movies_file = r'C:\Users\Admin\Documents\ml-1m[1]\ml-1m\movies.dat'
ratings_file = r'C:\Users\Admin\Documents\ml-1m[1]\ml-1m\ratings.dat'

# Step 1: Parse movies.dat to create a mapping of movie IDs to details
movie_details = {}  # {movie_id: (title, release_year, genres)}

with open(movies_file, 'r', encoding='latin-1') as movies:
    for line in movies:
        line = line.strip()
        movie_id, title, genres = line.split('::')
        release_year = int(title.strip()[-5:-1])  # Extract release year from title
        movie_details[int(movie_id)] = (title, release_year, genres)

# Step 2: Parse ratings.dat and count ratings for each movie
movie_ratings_count = {}  # {movie_id: count}

with open(ratings_file, 'r') as ratings:
    for line in ratings:
        line = line.strip()
        user_id, movie_id, rating, timestamp = map(int, line.split('::'))

        if movie_id not in movie_ratings_count:
            movie_ratings_count[movie_id] = 0
        movie_ratings_count[movie_id] += 1

# Step 3: Categorize movies by rating count
low_rating_threshold = 10  # Define the threshold for "few ratings"
low_rated_movies = []
popular_movies = []

for movie_id, count in movie_ratings_count.items():
    if count < low_rating_threshold:
        low_rated_movies.append(movie_id)
    else:
        popular_movies.append(movie_id)

# Step 4: Analyze characteristics of less-rated and popular movies
def analyze_movie_characteristics(movie_ids):
    genre_count = {}
    year_count = {}
    total_movies = len(movie_ids)

    for movie_id in movie_ids:
        title, release_year, genres = movie_details[movie_id]
        # Count genres
        for genre in genres.split('|'):
            if genre not in genre_count:
                genre_count[genre] = 0
            genre_count[genre] += 1
        # Count release years
        if release_year not in year_count:
            year_count[release_year] = 0
        year_count[release_year] += 1

    return genre_count, year_count, total_movies

# Analyze less-rated movies
low_rated_genres, low_rated_years, total_low_rated = analyze_movie_characteristics(low_rated_movies)

# Analyze popular movies
popular_genres, popular_years, total_popular = analyze_movie_characteristics(popular_movies)

# Step 5: Output Results
print(f"Total Movies: {len(movie_details)}")
print(f"Movies with Fewer than {low_rating_threshold} Ratings: {total_low_rated}")
print(f"Movies with {low_rating_threshold} or More Ratings: {total_popular}")

print("\nCharacteristics of Less-Rated Movies:")
print("Top Genres:")
for genre, count in sorted(low_rated_genres.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{genre}: {count} movies")
print("Top Release Years:")
for year, count in sorted(low_rated_years.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{year}: {count} movies")

print("\nCharacteristics of Popular Movies:")
print("Top Genres:")
for genre, count in sorted(popular_genres.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{genre}: {count} movies")
print("Top Release Years:")
for year, count in sorted(popular_years.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{year}: {count} movies")

Total Movies: 3883
Movies with Fewer than 10 Ratings: 446
Movies with 10 or More Ratings: 3260

Characteristics of Less-Rated Movies:
Top Genres:
Drama: 236 movies
Comedy: 109 movies
Documentary: 39 movies
Thriller: 32 movies
Romance: 29 movies
Top Release Years:
1995: 59 movies
1998: 56 movies
1996: 51 movies
1997: 49 movies
1994: 37 movies

Characteristics of Popular Movies:
Top Genres:
Drama: 1257 movies
Comedy: 1054 movies
Action: 470 movies
Thriller: 453 movies
Romance: 430 movies
Top Release Years:
1996: 261 movies
1998: 261 movies
1997: 255 movies
1995: 253 movies
1999: 243 movies
