In [4]:
import pandas as pd

df_full = pd.read_csv('data.csv')
df_full.head(2)

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l,Summary,Language,Category,city,state,country
0,0,2,"stockton, california, usa",18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,Provides an introduction to classical myths pl...,en,['Social Science'],stockton,california,usa
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada


In [5]:
# 1031175 is really heavy to proccess and showcase a demo so we are considering 10 % of it
df_full.shape

(1031175, 19)

In [6]:
random_seed = 42
df = df_full.sample(n=100000, random_state=random_seed)
df.shape

(100000, 19)

### Drop unnecessary columns like img_s, img_m, img_l 

In [7]:
df = df.drop(columns=['img_s', 'img_m', 'img_l', 'Unnamed: 0', 'Summary'])
df.head()

Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Language,Category,city,state,country
392141,262169,"walled lake, michigan, usa",15.0,0345388216,0,Dragonseye (Dragonriders of Pern Series),Anne McCaffrey,1997.0,Ballantine Books,en,['Fiction'],walled lake,michigan,usa
120888,73651,"grapevine, texas, usa",34.7439,0553375407,0,Ishmael: An Adventure of the Mind and Spirit,Daniel Quinn,1995.0,Bantam,en,['Fiction'],grapevine,texas,usa
472434,217565,"siegburg, nordrhein-westfalen, germany",34.7439,340412815X,0,Die BrÃ?Â¼cken der Freiheit.,Ken Follett,1998.0,LÃ?Â¼bbe,de,['Highlands (Scotland)'],siegburg,nordrhein-westfalen,germany
206129,185233,"winnemucca, nevada, usa",31.0,0452278902,5,The Green Mile: The Complete Serial Novel,Stephen King,1997.0,Plume Books,9,9,winnemucca,nevada,usa
553117,69697,"west sacramento, california, usa",24.0,037327145X,0,Whitelaw'S Wedding (The Protectors) (Silhouett...,Beverly Barton,2001.0,Silhouette,en,['Fiction'],west sacramento,california,usa


### Check for NaN values in each column

In [8]:
# for col in df.columns:
df.isnull().sum()

user_id                   0
location                  0
age                       0
isbn                      0
rating                    0
book_title                0
book_author               0
year_of_publication       0
publisher                 0
Language                  0
Category                  0
city                   1309
state                  2193
country                3445
dtype: int64

In [9]:
# Later the RAG app can analyze on missing data too
df.fillna("Unknown", inplace=True)

In [10]:
df.isnull().sum()

user_id                0
location               0
age                    0
isbn                   0
rating                 0
book_title             0
book_author            0
year_of_publication    0
publisher              0
Language               0
Category               0
city                   0
state                  0
country                0
dtype: int64

### Fixing data types of age and year_of_publication

In [11]:
print(df.dtypes)

user_id                  int64
location                object
age                    float64
isbn                    object
rating                   int64
book_title              object
book_author             object
year_of_publication    float64
publisher               object
Language                object
Category                object
city                    object
state                   object
country                 object
dtype: object


In [12]:
convert_dict = {'age': int,
                'year_of_publication': int
                }
 
df = df.astype(convert_dict)
print(df.dtypes)

user_id                 int64
location               object
age                     int32
isbn                   object
rating                  int64
book_title             object
book_author            object
year_of_publication     int32
publisher              object
Language               object
Category               object
city                   object
state                  object
country                object
dtype: object


In [13]:
df.head(3)

Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Language,Category,city,state,country
392141,262169,"walled lake, michigan, usa",15,0345388216,0,Dragonseye (Dragonriders of Pern Series),Anne McCaffrey,1997,Ballantine Books,en,['Fiction'],walled lake,michigan,usa
120888,73651,"grapevine, texas, usa",34,0553375407,0,Ishmael: An Adventure of the Mind and Spirit,Daniel Quinn,1995,Bantam,en,['Fiction'],grapevine,texas,usa
472434,217565,"siegburg, nordrhein-westfalen, germany",34,340412815X,0,Die BrÃ?Â¼cken der Freiheit.,Ken Follett,1998,LÃ?Â¼bbe,de,['Highlands (Scotland)'],siegburg,nordrhein-westfalen,germany


In [15]:
df['location'].replace('n/a, n/a, n/a', 'Unknown', inplace=True)
df['Language'].replace('9', 'Unknown', inplace=True)
df['Category'].replace('9', 'Unknown', inplace=True)
df['state'].replace(',', 'Unknown', inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].replace('n/a, n/a, n/a', 'Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Language'].replace('9', 'Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Language,Category,city,state,country
392141,262169,"walled lake, michigan, usa",15,0345388216,0,Dragonseye (Dragonriders of Pern Series),Anne McCaffrey,1997,Ballantine Books,en,['Fiction'],walled lake,michigan,usa
120888,73651,"grapevine, texas, usa",34,0553375407,0,Ishmael: An Adventure of the Mind and Spirit,Daniel Quinn,1995,Bantam,en,['Fiction'],grapevine,texas,usa
472434,217565,"siegburg, nordrhein-westfalen, germany",34,340412815X,0,Die BrÃ?Â¼cken der Freiheit.,Ken Follett,1998,LÃ?Â¼bbe,de,['Highlands (Scotland)'],siegburg,nordrhein-westfalen,germany
206129,185233,"winnemucca, nevada, usa",31,0452278902,5,The Green Mile: The Complete Serial Novel,Stephen King,1997,Plume Books,Unknown,Unknown,winnemucca,nevada,usa
553117,69697,"west sacramento, california, usa",24,037327145X,0,Whitelaw'S Wedding (The Protectors) (Silhouett...,Beverly Barton,2001,Silhouette,en,['Fiction'],west sacramento,california,usa


### Save the file to .csv

In [16]:
df.to_csv('db_data.csv')