Pandas Practice

In [1]:
import pandas as pd

# Small sample frame used to compare the different shapes `.loc` can return.
data = {
    "cal": [420, 380, 290],
    "dur": [50, 40, 45],
}
df = pd.DataFrame(data)

# One print call; the empty strings render as the blank separator lines.
print(
    df,                 # the whole DataFrame
    "",
    df.loc[0],          # single label -> that row as a Series
    "",
    df.loc[[0]],        # one-element list -> the same row, kept as a DataFrame
    "",
    df.loc[[0, 1]],     # list of labels -> DataFrame holding exactly those rows
    sep="\n",
)

   cal  dur
0  420   50
1  380   40
2  290   45

cal    420
dur     50
Name: 0, dtype: int64

   cal  dur
0  420   50

   cal  dur
0  420   50
1  380   40


In [None]:
# Sample frame whose "date" column contains one string and two missing values.
# This cell has no import of its own; `pd` comes from an earlier cell.
data = {
    "cal": [420, 380, 290],
    "dur": [50,40,45],
    "date": ["12121212", None, None]
}

df = pd.DataFrame(data)
print(df)
print("")

# Fill missing values in the "date" column only, modifying `df` in place.
df.fillna( {"date": "11/11/11"}, inplace=True)
# format='mixed' lets pandas infer the format of each element individually.
# NOTE(review): "12121212" has no separators -- confirm how the installed
# pandas version parses it and that the result is inside the supported
# datetime range; this cell has no recorded execution or output.
df["date"] = pd.to_datetime(df["date"], format='mixed')
print(df)

In [None]:
# Pairwise correlation matrix of the two sample columns.
data = {"cal": [420, 380, 290], "dur": [50, 40, 45]}
df = pd.DataFrame(data=data)
# "pearson" is the default method; spelled out here for the reader.
print(df.corr(method="pearson"))

Pandas Data Cleaning Challenge

In [7]:
import pandas as pd
import numpy as np

#Drop rows with invalid data in column
def drop_rows(df:pd.DataFrame, columns:list[str], invalid_data:list):
    """Remove, in place, every row that holds an invalid value in any given column.

    Args:
        df: Frame to clean. It is modified in place.
        columns: Names of the columns to inspect.
        invalid_data: Values that mark a row as invalid.

    Returns:
        None. The frame passed in is mutated.

    Notes:
        Membership is tested with ``Series.isin``, so a NaN listed in
        ``invalid_data`` matches missing cells. Rows are removed by index
        label, so every row sharing a matched label is removed.
    """
    for column in columns:
        # One vectorised membership test and a single drop per column,
        # instead of one ``df.drop`` call (and frame rebuild) per bad row.
        is_invalid = df[column].isin(invalid_data)
        if is_invalid.any():
            df.drop(index=df.index[is_invalid.to_numpy()], inplace=True)

#Fix rows with invalid data in column
def fix_rows(df:pd.DataFrame, columns:list[str], invalid_to_correct_data:dict):
    """Replace known-bad cell values, in place, using a lookup table.

    Args:
        df: Frame to repair. It is modified in place.
        columns: Names of the columns to scan.
        invalid_to_correct_data: Maps each invalid value to its replacement.

    Returns:
        None. Cells whose value is not a key of the mapping are left as-is.
    """
    replacements = invalid_to_correct_data
    for column in columns:
        for label in df.index:
            current = df.loc[label, column]
            if current not in replacements:
                continue
            df.loc[label, column] = replacements[current]

#Move data in column to within acceptable range (inclusive)
def clamp_int_data(df:pd.DataFrame, columns:list[str], minimum:int, maximum:int):
    """Clamp, in place, the integer cells of the given columns to [minimum, maximum].

    Args:
        df: Frame to adjust. It is modified in place.
        columns: Names of the columns to clamp.
        minimum: Smallest allowed value (inclusive).
        maximum: Largest allowed value (inclusive).

    Returns:
        None. Non-integer cells (strings, floats, None, booleans) are left
        untouched.
    """
    for column in columns:
        for label in df.index:
            value = df.loc[label, column]
            # Cells read from an integer-typed column come back as NumPy
            # integers, not Python ``int``, so both kinds are accepted.
            # ``bool`` is an ``int`` subclass and is deliberately skipped.
            if isinstance(value, bool) or not isinstance(value, (int, np.integer)):
                continue
            if value < minimum:
                df.loc[label, column] = minimum
            elif value > maximum:
                df.loc[label, column] = maximum

#If data in column is a string, convert it to title case
def format_as_title(df:pd.DataFrame, columns:list[str]):
    """Title-case, in place, every string cell of the given columns.

    Args:
        df: Frame to format. It is modified in place.
        columns: Names of the columns to format.

    Returns:
        None. Cells whose type is not exactly ``str`` are left untouched.
    """
    for column in columns:
        for label in df.index:
            text = df.loc[label, column]
            if type(text) is not str:
                continue
            df.loc[label, column] = text.title()

# Load the raw records; every step below cleans `df` in place.
df = pd.read_json("books_dirty.json")

#1 Treat an empty last_checkout as a missing value.
empty_to_NaN = {"": np.nan}
columns = ["last_checkout"]
fix_rows(df, columns, empty_to_NaN)

#2 Pull publication years back inside the accepted range (inclusive).
minimum = 1800
maximum = 2026
columns = ["publication_year"]
clamp_int_data(df, columns, minimum, maximum)

#3
#Replacing all incorrect values with NaN
columns = ["publication_year", "page_count", "average_rating", "ratings_count", "price_usd", "in_print", "sales_millions", "last_checkout", "available"]
# NOTE(review): this mapping used to list "Unknown" twice; a dict keeps one
# entry per key, so the duplicate had no effect and is removed. If a second
# placeholder spelling was intended, add it here.
incorrect_to_NaN = {"Unknown": np.nan, "N/A": np.nan}
fix_rows(df, columns, incorrect_to_NaN)

#Fixing "true"/"false" to actual boolean values
columns = ["in_print", "available"]
string_to_bool = {"true": True, "false": False}
fix_rows(df, columns, string_to_bool)

#Clamp negative values
# The upper bound is only a large sentinel; the step exists for the lower bound.
# NOTE(review): clamp_int_data only adjusts integer cells, so fractional
# values in these columns are not clamped -- confirm that is intended.
minimum = 0
maximum = 999999999
columns = ["page_count", "price_usd", "sales_millions"]
clamp_int_data(df, columns, minimum, maximum)

#4 Parse last_checkout into datetimes; format="mixed" infers each element's format.
df["last_checkout"] = pd.to_datetime(df["last_checkout"], format="mixed")

#5 Normalise the capitalisation of the categorical text columns.
columns = ["genre", "language", "format", "publisher"]
format_as_title(df, columns)

#6 Keep only the first row for each (author, title) pair.
df.drop_duplicates(subset=["author", "title"], inplace=True)

#7
#df.dropna(inplace=True)

#Change ending to .com
def _with_com_suffix(email):
    """Return `email` with its trailing dot-separated label(s) replaced by "com"."""
    if not isinstance(email, str):
        # Missing or non-text entries cannot be split; leave them unchanged
        # instead of raising AttributeError.
        return email
    labels = email.split('.')
    # A trailing "uk" is treated as a two-label suffix and both labels are
    # removed; any other ending loses only its last label.
    # NOTE(review): an address ending in a bare ".uk" loses one label too
    # many under this rule -- confirm against the data.
    kept = labels[:-2] if labels[-1] == "uk" else labels[:-1]
    return ".".join(kept + ["com"])

df["publisher_email"] = df["publisher_email"].map(_with_com_suffix)

print(df)



                                  book_id          title     author  \
0    c28ad32d-8f9e-4456-b580-ad91d73205c4  Book Title 15  Author 27   
1    dae4fc9f-9f13-4b72-9fa2-cf35979d764d   Book Title 6  Author 12   
2    3a596637-6e4f-4a1d-92dd-3d30be3d203c  Book Title 17  Author 13   
3    5076ac12-dcad-4f11-b501-de89c23e5917  Book Title 11  Author 13   
4    c0a021f7-0149-4417-aa90-f4d0564a3b86   Book Title 6  Author 26   
..                                    ...            ...        ...   
487  de320c87-b06d-44f1-b557-2ab9c71ef628   Book Title 5   Author 8   
491  61e5c712-8519-48b7-9445-8802711a1409  Book Title 19  Author 25   
492  8fde4bbf-1b42-43eb-8afe-67d88c6e06d0   Book Title 5  Author 21   
496  ba41d3bd-41f2-4ccc-8def-94f6f678284b   Book Title 4  Author 24   
499  1dc98d4c-7012-48da-812f-c7055c849a6f   Book Title 9  Author 26   

               genre publication_year page_count average_rating ratings_count  \
0            Romance             None       None            NaN   