In [38]:
import pandas as pd 
import numpy as np
import os


# Load the music dataset; the cells below read its country, plays, genre, fans and
# artist columns.
df = pd.read_csv("music.csv")

### filtering

In [14]:
# Example 1: negate a combined mask -> every row that is NOT (US and plays >= 500).
out = df.loc[~((df["country"] == "US") & (df["plays"] >= 500))]

# Example 2: membership test -> only US / UK rows (rebinds `out`, replacing example 1).
out = df.loc[df["country"].isin(["US", "UK"])]
# print(out.artist.values)

### groupby

In [16]:
# Selecting a column from a GroupBy gives a lazy SeriesGroupBy object (not a Series);
# printing it only shows the object's repr.
print(df.groupby("country")["plays"])
# Aggregating produces a Series whose index holds the group keys.
print(df.groupby("country")["plays"].sum().index)
print(df.groupby("country")["plays"].sum().to_dict())
# Two grouping keys -> MultiIndex, so the dict keys are (country, genre) tuples.
print(df.groupby(["country", "genre"])["plays"].sum().to_dict())

### reset_index

In [18]:
# Aggregated Series: the country labels are the index.
non_reset = df.groupby("country")["plays"].sum()
# reset_index() turns the country labels into a regular column with a 0..n-1 index.
reset = non_reset.reset_index()
# Countries whose total plays exceed 1000.
print(reset.loc[reset["plays"] > 1000, "country"].values)

### apply

In [None]:
# Plays-per-fan ratio for each country.
# The ratio is computed once and bound to a name (the original evaluated the same
# expression twice and discarded the first result). Selecting the two needed columns
# before apply() keeps the grouping column out of the lambda's input, which newer
# pandas versions warn about.
plays_per_fan = df.groupby("country")[["plays", "fans"]].apply(
    lambda group: group["plays"].sum() / group["fans"].sum()
)
plays_per_fan.to_dict()

In [29]:
def is_popular(plays, fans):
    """Return a popularity label for an artist.

    Returns 'very popular' when plays > 1000 or fans > 50, 'popular' when
    plays >= 500, and 'not popular' otherwise.
    """
    # Guard clauses, checked from the highest tier down.
    if plays > 1000 or fans > 50:
        return 'very popular'
    if plays >= 500:
        return 'popular'
    return 'not popular'
    


df = pd.read_csv("music.csv")

# Row-wise apply: each row's plays and fans are passed to is_popular.
df["popular"] = df.apply(
    lambda row: is_popular(row["plays"], row["fans"]),
    axis="columns",
)

# Mapping of artist -> popularity label.
df.set_index("artist")["popular"].to_dict()

In [31]:
df = pd.read_csv("sales.csv").set_index("Band")

# Convert strings such as "10k" into integers (10000). This is done column-wise with
# the vectorised .str accessor because DataFrame.applymap is deprecated in recent
# pandas releases; the column-wise form works on old and new versions alike.
replaced = df.apply(
    lambda column: column.str.replace("k", "", regex=False).astype(int) * 1000
)

# Highest sales figure per column; the built-in reduction replaces
# apply(lambda x: max(x), axis=0).
mx = replaced.max()
mx.to_dict()

{'US Sales': 10000, 'UK Sales': 20000, 'Egypt Sales': 5000}

### Aggregate

Your music analyst is interested in knowing multiple statistics at once, grouped by country. These are as follows:

- Sum of plays
- Average of plays
- Maximum fans from all artists in the country

In [22]:
# Reload the music data: the preceding cell rebinds `df` to the sales table, which has
# no 'country', 'plays' or 'fans' columns, so a top-to-bottom run would otherwise fail
# with a KeyError here.
df = pd.read_csv("music.csv")

# One pass over the groups computing several statistics; the result has two-level
# columns: (plays, sum), (plays, mean), (fans, max).
grp = df.groupby("country").agg({"plays": ["sum", "mean"], "fans": ["max"]})
display(grp)

Unnamed: 0_level_0,plays,plays,fans
Unnamed: 0_level_1,sum,mean,max
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Egypt,200,200.0,10
Finland,250,250.0,10
UK,35150,8787.5,3500
US,2050,512.5,80


### to datetime

In [25]:
# Load the rows and parse the 'date' column into datetimes in one chained step.
df = pd.read_csv("dates.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# Count rows per calendar month ('MS' = month-start frequency).
resampled = df.set_index("date").resample("MS").size()

# Relabel each month bucket as a "YYYY-MM" string.
resampled.index = resampled.index.map(lambda stamp: stamp.strftime("%Y-%m"))

### map

In [27]:
df = pd.read_csv("music.csv")

# Lookup table: country code -> continent name.
continents = {
    "UK": "Europe",
    "US": "North America",
    "Egypt": "Africa",
    "Finland": "Europe",
}

# Element-wise lookup; a country missing from the table raises KeyError.
df["continent"] = df["country"].map(lambda code: continents[code])

### basic merge 

In [34]:
df = pd.read_csv("music_2.csv")
countries = pd.read_csv("countries.csv")

# Inner join (the default): only rows whose country matches a country_id are kept.
merged = pd.merge(
    df,
    countries,
    how="inner",
    left_on="country",
    right_on="country_id",
)



### merge with missing values

In [None]:
df = pd.read_csv("music_2.csv")
countries = pd.read_csv("countries.csv")

# Left join keeps every music row; rows without a matching country_id get NaN in the
# columns that come from `countries`.
merged = df.merge(countries, how="left", left_on="country", right_on="country_id")

# Total plays of the rows that found no matching country.
merged.loc[merged["name"].isnull(), "plays"].sum()

## MORE PANDAS

In [47]:
path = "./Sales_Data"
# Ignore directory entries whose name starts with "." (hidden files).
files = [file for file in os.listdir(path) if not file.startswith(".")]

# Read every file first and concatenate once. Growing a DataFrame with pd.concat
# inside a loop re-copies all accumulated rows on each iteration, and seeding the
# result with an empty DataFrame triggers dtype warnings in recent pandas.
df_all = pd.concat([pd.read_csv(os.path.join(path, file)) for file in files])

# Persist the combined data, then reload it so `df` starts with a fresh 0..n-1 index.
df_all.to_csv("all_sales_data.csv", index=False)
df = pd.read_csv("all_sales_data.csv")

df.head(1)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,209921,USB-C Charging Cable,1,11.95,06/23/19 19:34,"950 Walnut St, Portland, ME 04101"


### drop NaN rows

In [48]:
# Rows containing at least one missing value (kept aside for inspection).
df_nan = df.loc[df.isna().any(axis="columns")]

# Remove only the rows in which every column is missing.
df = df.dropna(axis="index", how="all")
df.shape

(186305, 6)

### filtering

In [49]:
# Drop rows whose 'Order Date' starts with "Or" (presumably repeated header lines
# inside the data - TODO confirm). The explicit .copy() makes `df` an independent
# frame, so the column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['Order Date'].str[0:2] != 'Or'].copy()

### convert data type to numeric

In [50]:
# Convert the quantity and price columns to numeric dtypes.
df = df.assign(
    **{
        "Quantity Ordered": pd.to_numeric(df["Quantity Ordered"]),
        "Price Each": pd.to_numeric(df["Price Each"]),
    }
)

### augment data

In [51]:
# Month number taken from the first two characters of the order-date string.
df["Month"] = df["Order Date"].str.slice(0, 2).astype("int32")

In [52]:
# Same month, obtained by parsing the full timestamp. The explicit format (matching
# values such as "06/23/19 19:34") avoids per-call format inference, which is slower
# and can misread ambiguous day/month orderings.
df["Month2"] = pd.to_datetime(df["Order Date"], format="%m/%d/%y %H:%M").dt.month

### analysis

In [None]:
# Revenue per order line = quantity x unit price.
df["Sales"] = df["Quantity Ordered"].astype("int").mul(df["Price Each"].astype("float"))

In [53]:
# For every order, join the names of all its products into one comma-separated string;
# transform() broadcasts that string back onto each row of the order.
df["Grouped"] = df.groupby("Order ID")["Product"].transform(
    lambda products: ",".join(products)
)

# Keep one row per distinct (Order ID, Grouped) pair.
df = df.loc[:, ["Order ID", "Grouped"]].drop_duplicates()
df

Unnamed: 0,Order ID,Grouped
0,209921,USB-C Charging Cable
1,209922,Macbook Pro Laptop
2,209923,ThinkPad Laptop
3,209924,27in FHD Monitor
4,209925,Bose SoundSport Headphones
...,...,...
186844,176553,USB-C Charging Cable
186845,176554,Lightning Charging Cable
186846,176555,27in FHD Monitor
186847,176556,AAA Batteries (4-pack)


## MORE PANDAS

### some more filtering

In [None]:
# NOTE(review): this section expects `df` to be a frame with 'playerID' and 'teamID'
# columns; no cell visible here loads such a frame - TODO add the load step.

# Boolean masks built with the .str accessor. Each mask gets its own name: assigning
# them to `df` (as before) replaced the DataFrame with a boolean Series, so every
# following df['...'] lookup - and the later sorting / value_counts cells - failed.
starts_with_c = df['playerID'].str.startswith('c')
ends_with_s = df['teamID'].str.endswith('S')
without_o = ~df['playerID'].str.contains('o')

# Masks for missing / present team IDs.
isna = df['teamID'].isna()
notna = df['teamID'].notna()



### sorting

In [None]:
# Returns a new frame ordered by playerID, descending; `df` itself is not modified.
df.sort_values(by="playerID", ascending=False)

### for categorical values 

In [None]:

p_ids = df['playerID']
# Absolute frequency of each player ID.
p_ids.value_counts()
# Relative frequency of each player ID (stray closing parenthesis removed - it made
# the whole cell a SyntaxError).
p_ids.value_counts(normalize=True)

# Distinct player IDs, in order of first appearance.
df['playerID'].unique()

### convert to numpy

In [None]:
# NumPy array holding the frame's values (index and column labels are dropped).
n_matrix = df.to_numpy()

### metrics

In [None]:
# Summary statistics per column; by default pandas reports count, mean, std, min,
# quartiles and max for the numeric columns.
df.describe()