In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import pandas as pd

filename = 'books data/BX-Books.csv'

def load_csv_with_skip_and_read(filename, encoding='latin-1', delimiter=';'):
    data = []
    skipped_rows = 0
    total_rows = 0
    
    with open(filename, 'r', encoding=encoding) as file:
        for line_number, line in enumerate(file, start=1):
            total_rows += 1
            try:
                row = line.strip().split(delimiter)
                
                # Check if the row has specific unwanted column counts
                unwanted_column_counts = [14, 13, 12, 10, 11, 10, 9]
                if len(row) in unwanted_column_counts:
                    print(f"Skipping row {line_number} with {len(row)} columns: {row}")
                    skipped_rows += 1
                else:
                    # Only append rows with other column counts to the data
                    data.append(row)
                    
            except Exception as e:
                print(f"Error in line {line_number}: {e}")
    
    # Specify column names for the desired 8 columns
    columns = ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
    
    books = pd.DataFrame(data, columns=columns)
    
    print(f"\nSkipped {skipped_rows} rows out of {total_rows} total rows.")
    return books

# Call the function to remove specific rows and read data with 8 columns
books = load_csv_with_skip_and_read(filename)


Skipping row 6 with 9 columns: ['"0393045218"', '"The Mummies of Urumchi"', '"E. J. W. Barber"', '"1999"', '"W. W. Norton &amp', ' Company"', '"http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg"', '"http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg"', '"http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg"']
Skipping row 23 with 9 columns: ['"1841721522"', '"New Vegetarian: Bold and Beautiful Recipes for Every Occasion"', '"Celia Brooks Brown"', '"2001"', '"Ryland Peters &amp', ' Small Ltd"', '"http://images.amazon.com/images/P/1841721522.01.THUMBZZZ.jpg"', '"http://images.amazon.com/images/P/1841721522.01.MZZZZZZZ.jpg"', '"http://images.amazon.com/images/P/1841721522.01.LZZZZZZZ.jpg"']
Skipping row 25 with 10 columns: ['"0061076031"', '"Mary-Kate &amp', ' Ashley Switching Goals (Mary-Kate and Ashley Starring in)"', '"Mary-Kate &amp', ' Ashley Olsen"', '"2000"', '"HarperEntertainment"', '"http://images.amazon.com/images/P/0061076031.01.THUMBZZZ.jpg"', '"ht

In [6]:
books.shape

(250013, 8)

In [7]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [9]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L']]

In [10]:
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,"""ISBN""","""Book-Title""","""Book-Author""","""Year-Of-Publication""","""Publisher""","""Image-URL-L"""
1,"""0195153448""","""Classical Mythology""","""Mark P. O. Morford""","""2002""","""Oxford University Press""","""http://images.amazon.com/images/P/0195153448...."


In [12]:
books.rename(columns={
    "Book-Title" :"title",
    "Book-Author":"author",
    "Years-Of-Publication":"year",
    "Publisher":"publisher",
    "Image-URL-L":"img_url"}, inplace = True)

In [13]:
books.head(2)

Unnamed: 0,ISBN,title,author,Year-Of-Publication,publisher,img_url
0,"""ISBN""","""Book-Title""","""Book-Author""","""Year-Of-Publication""","""Publisher""","""Image-URL-L"""
1,"""0195153448""","""Classical Mythology""","""Mark P. O. Morford""","""2002""","""Oxford University Press""","""http://images.amazon.com/images/P/0195153448...."


In [17]:
import csv
with open("books data/BX-Users.csv", "r", encoding="latin-1") as file:
    try:
        # Attempt to read the CSV file using pandas
        users = pd.read_csv(file, sep=";")
    except csv.Error as e:
        # Handle parsing errors
        print(f"Error parsing CSV file: {e}")

In [18]:
users.head(2)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [19]:
users.shape

(278858, 3)

In [20]:
with open("books data/BX-Book-Ratings.csv", "r", encoding="latin-1") as file:
    try:
        # Attempt to read the CSV file using pandas
        ratings = pd.read_csv(file, sep=";")
    except csv.Error as e:
        # Handle parsing errors
        print(f"Error parsing CSV file: {e}")

In [21]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [22]:
ratings.shape

(1149780, 3)

In [23]:
print(books.shape)
print(users.shape)
print(ratings.shape)

(250013, 6)
(278858, 3)
(1149780, 3)


In [27]:
ratings.rename(columns={
    "User-ID":"user_id",
    "Book-Rating":"rating"}, inplace=True)

In [30]:
ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
