In [1]:
#standard import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#seaborn import
import seaborn as sns

In [2]:
# pandas quick reference.
# NOTE: `df`, `left`, `right`, `mb1` and `mb2` are placeholders; they are not
# defined in the cells shown here, so most lines are syntax reminders only.

# Create a Series with an explicit string index.
bacteria = pd.Series([632, 1638, 569, 115], 
    index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'])

# Boolean mask as a plain Python list: True where the index label ends with
# 'bacteria' (for the index above: [False, True, True, False]).
[name.endswith('bacteria') for name in bacteria.index] # mask

# Apply a function: here every column is multiplied element-wise by itself.
df1 = df.apply(lambda x: x * x, axis=0) # axis=0: apply to each column; axis=1: apply to each row

# Import a CSV file.
# NOTE(review): as written the call has no arguments; read_csv needs a path or
# file-like object as its first argument.
pd.read_csv()

# Merge two frames (SQL-style join); the keyword arguments are spelled out as a
# reminder of what can be tuned. The result is not assigned here.
left.merge(right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False)

# Concatenate frames; the result is not assigned here.
pd.concat([mb1, mb2], axis=1) # axis=0: stack along rows; axis=1: stack along columns

# Duplicates: `duplicated` flags repeated rows, `drop_duplicates` removes them.
# Neither result is assigned, so `df` itself is left unchanged.
df.duplicated(subset='names') # subset: column name(s) used to decide what counts as a duplicate
df.drop_duplicates(['names'])

# Group by 'team' and compute several named aggregations at once
# (output_column=(input_column, aggregation)).
# NOTE(review): `sum_points` is computed with nunique(), i.e. it holds the
# number of distinct 'points' values per team, not their sum.
df.groupby('team').agg(sum_points=('points', lambda x: x.nunique()),
                       mean_assists=('assists', 'mean'),
                       max_rebounds=('rebounds', 'max'))

# Year / month extraction: convert the column with to_datetime first, then use
# the .dt accessor.
df['datetime'] = pd.to_datetime(df['datetime'])
df['year'] = df['datetime'].dt.year 
df['month'] = df['datetime'].dt.month 

[False, True, True, False]

In [None]:
# Statistics quick reference.

# Test whether two samples differ significantly: ttest_ind is SciPy's t-test
# for the means of two independent samples.
# `dm` and `df` are not defined in the cells shown here; presumably they are the
# two samples being compared -- TODO confirm.
from scipy import stats
stats.ttest_ind(dm,df)

In [None]:
# Matplotlib quick reference.

# One figure with a 1x3 grid of axes, unpacked into ax1, ax2, ax3.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 6))

# Title and axis labels are set per axes object.
ax1.set_title('Current')
ax1.set_xlabel('Time [s]')
ax1.set_ylabel('Current [A]')
# The legend is built from labelled artists; nothing has been plotted on ax1 in
# this snippet, so plot with label=... before calling it.
ax1.legend()

# NOTE(review): this is a pyplot-level call, so it acts on the *current* axes
# rather than on ax1 specifically -- confirm which panel is meant
# (ax.set_xscale("log") targets a given axes explicitly).
plt.xscale("log")

# Adjust subplot spacing so that labels and titles do not overlap.
plt.tight_layout()

# Seaborn quick reference.
# `data`, `n`, `df`, `movies` and `df_final_after_april14` are placeholders that
# are not defined in the cells shown here.

# Bar plot of the first n entries: labels on x, values on y.
# presumably `data` is a pandas Series (it is sliced and has .keys()) -- TODO confirm.
# NOTE(review): `ax` is used here before it is assigned two statements below;
# it must already exist (e.g. one of the axes returned by plt.subplots).
sns.barplot(x=data[:n].keys(), y=data[:n], ax=ax)

# Box plot with horizontal boxes (numeric variable on x, category on y).
ax = sns.boxplot(x="text_len", y="category", data=data)
# Add in points to show each observation.
sns.stripplot(x="text_len", y="category", data=data)

# Plain matplotlib box plot of a single column.
plt.boxplot(df['x'])

# Complementary ECDF (complementary=True) of `salary`, restricted to the rows
# where throws == "L".
sns.ecdfplot(df[df["throws"] == "L"].salary, label="Left-handed", complementary=True)

# Line plot with a confidence interval: seaborn aggregates repeated x values and
# draws an interval band around the estimate.
sns.lineplot(x="month", y="numwords1", data=df_final_after_april14, label='winner')

# Histogram of one column with 100 bins.
df['x'].hist(bins = 100)

# Scatter plot of two columns.
plt.scatter(df['x'], df['y'])
# Joint view of two distributions: hexbin plot plus marginal distributions.
sns.jointplot(x=movies['worldwide_gross'], y=movies['imdb_rating'], kind="hex")

# Heatmap of a cross-tabulation.
df2 = pd.crosstab(df['x'], df['y']) # By default, computes a frequency table of the factors unless an array of values and an aggregation function are passed.
# Same table, but summing df['values'] per (x, y) cell instead of counting rows.
df3 = pd.crosstab(df['x'], df['y'], values = df['values'], margins=False, aggfunc='sum')
# annot=True writes the value in each cell; vmin/vmax fix the colour scale to [0, 20].
sns.heatmap(df2, annot=True, vmin = 0, vmax = 20) # df2 is the crosstab of two columns; df3 can be passed instead

In [None]:
#JSON tricks

In [None]:
#Networkx tricks
# NetworkX quick reference: build a graph from tab-separated node and edge lists.
import networkx as nx
# MultiDiGraph: directed graph that allows several parallel edges between the
# same pair of nodes.
G = nx.MultiDiGraph()
edge_list = pd.read_csv("./data/part-1/edgelist.tsv", sep="\t")
node_list = pd.read_csv("./data/part-1/nodelist.tsv", sep="\t")

# Add one node per row of node_list; the node id is read from column 'u', and
# 'score' and 'name' are stored as node attributes.
for _, node in node_list.iterrows():
    node = dict(node)  # row (Series) -> plain dict keyed by column name
    G.add_node(node['u'], score=node['score'], name=node['name'])

# Add one edge u -> v per row of edge_list, with 'gender' stored as an edge attribute.
for _, edge in edge_list.iterrows():
    edge = dict(edge)  # row (Series) -> plain dict keyed by column name
    G.add_edge(edge['u'], edge['v'], gender=edge['gender'])

# Show the attributes: G.nodes.items() yields (node, attribute-dict) pairs.
for node_ in G.nodes.items():
    print(node_)

In [None]:
# ML quick reference: preprocessing.
# `df`, `columns_to_dummies`, `X` and `y` are placeholders that are not defined
# in the cells shown here.

# One-hot encoding by hand.
tracks = df["track"].unique()
# Add one 0/1 column named '<track>-onehot' per distinct value of 'track'.
for track in tracks:
   df[track+'-onehot'] = (df['track'] == track).astype(int)

# Turn a categorical outcome into a binary target: 1 for 'Adoption', 0 otherwise.
df['adopted'] = df.outcome_type.apply(lambda r: 1 if r=='Adoption' else 0)
# presumably `columns_to_dummies` is a list of column names -- TODO confirm.
df = pd.get_dummies(df, columns=columns_to_dummies) # Each listed variable is converted into as many 0/1 variables as it has distinct values


# Split the dataset: 33% of the rows go to the test set; random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Sampling for a bootstrap --> see exam 2022 or 2019 (version without a DataFrame).
# With a DataFrame:
# NOTE(review): frac=2 draws twice as many rows as df contains (which requires
# replace=True); a usual bootstrap resample keeps the original size (frac=1).
# The sampled frame is not assigned here.
df.sample(frac=2, replace=True, random_state=1)

#general method
def simple_bootstrap(f, val, n_boot=10000):
    """Percentile-bootstrap 95% confidence interval for a statistic.

    Parameters
    ----------
    f : callable
        Statistic to bootstrap. It is called on a 2-D array of shape
        ``(len(val), n_boot)`` whose columns are the resamples, so it must
        reduce along ``axis=0`` (one value per column). It is also called once
        on ``val`` itself to obtain the point estimate.
    val : 1-D array-like
        Observed sample.
    n_boot : int, default 10000
        Number of bootstrap resamples.

    Returns
    -------
    tuple
        ``(estimate, lower, upper)``: ``f(val)`` followed by the 2.5% and 97.5%
        quantiles of the bootstrapped statistic.

    Notes
    -----
    Resampling uses NumPy's global random state; call ``np.random.seed``
    beforehand for reproducible bounds.
    """
    val = np.asarray(val)
    # NumPy's sampler is required here: the standard-library random.choice has
    # no size/replace arguments (and `random` is not imported in this notebook).
    bs = np.random.choice(val, size=(len(val), n_boot), replace=True)
    val_ = f(bs)
    lower, upper = np.quantile(val_, q=[0.025, 0.975])
    # Report the statistic that was actually bootstrapped, not always the mean.
    return f(val), lower, upper

# Example: 95% CI for the mean of 1000 uniform draws.
simple_bootstrap(lambda x: np.mean(x, axis=0), np.random.random(1000))

# 95% confidence interval: the 2.5% and 97.5% quantiles of the values in DIFF.
# presumably DIFF holds bootstrapped differences; it is not defined in the
# cells shown here -- TODO confirm.
print("95% CI:", np.quantile( np.array(DIFF), q=[0.025, 0.975]))

# Balancing a dataset --> look at exo05 or exam 2020 (or cry).

# statsmodels: ordinary least squares via the formula API.
import statsmodels.api as sm 
import statsmodels.formula.api as smf
# Formula syntax is 'outcome ~ predictor1 + predictor2', using column names of `data`.
mod = smf.ols(formula='time ~ diabets + high_blood_pressure', data=df) #log transform possible -- see ex04
# NOTE(review): seeds NumPy's global RNG; nothing in this snippet visibly draws
# random numbers -- confirm whether the seed is needed.
np.random.seed(2)
res = mod.fit()
print(res.summary())
# Fitted quantities pulled out of the results object.
coefficients = res.params.values
# Kept as returned by statsmodels (no .values), unlike the other two.
p_values = res.pvalues
standard_errors = res.bse.values

# scikit-learn model: logistic regression.
from sklearn.linear_model import LogisticRegression

# Keep only the last 5497 columns of the training features.
# NOTE(review): X_test is not sliced the same way in this snippet; if X_train
# has more than 5497 columns, predict/score below will see a different number of
# features -- confirm. The self-assignment also makes this line non-idempotent
# when the cell is re-run on differently shaped data.
X_train = X_train.iloc[:,-5497:]

# Fit the classifier on the training split.
reg= LogisticRegression(random_state=0).fit(X_train,y_train)

# Predicted labels for the test split, and the mean accuracy on it.
predict = reg.predict(X_test)
score = reg.score(X_test,y_test)

# Fitted coefficients and intercept (bare expressions: a notebook only displays
# the last expression of a cell).
reg.coef_
reg.intercept_

# Confusion matrix - exo7.
# scikit-learn's convention: rows are true labels, columns are predicted labels,
# so for binary 0/1 labels the layout is [[TN, FP], [FN, TP]].
# `y_true` and `y_pred` are not defined in the cells shown here.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

def plot_confusion_matrix(confusion_matrix):
    """Draw a 2x2 confusion matrix as an annotated heatmap.

    Parameters
    ----------
    confusion_matrix : 2x2 array-like
        Counts laid out as ``[[TP, FP], [FN, TN]]``. This is not the layout
        produced by ``sklearn.metrics.confusion_matrix`` for 0/1 labels
        (``[[TN, FP], [FN, TP]]``), so reorder such a matrix before passing it.
        Inside this function the parameter shadows the sklearn function of the
        same name.

    Returns
    -------
    The object returned by ``sns.heatmap`` (the matplotlib Axes that was drawn on).
    """
    [[TP, FP], [FN, TN]] = confusion_matrix
    # Per-cell annotation text: the cell's role followed by its count.
    label = np.asarray([['TP {}'.format(TP), 'FP {}'.format(FP)],
                        ['FN {}'.format(FN), 'TN {}'.format(TN)]])

    df_cm = pd.DataFrame(confusion_matrix, index=['Yes', 'No'], columns=['Positive', 'Negative'])

    # seaborn is imported as `sns` in this notebook; the former `sn` alias was
    # never defined and raised a NameError.
    # fmt='' is needed because the annotations are strings, not numbers.
    return sns.heatmap(df_cm, cmap='YlOrRd', annot=label, annot_kws={"size": 16}, cbar=False, fmt='')

# Clustering - exo08.
# DBSCAN is imported alongside KMeans but is not used in this snippet.
from sklearn.cluster import KMeans, DBSCAN
# Fit k-means on X; `n_clusters` and `X` are not defined in the cells shown here.
# random_state fixes the centroid initialisation so the result is reproducible.
kmean = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
# Coordinates of the fitted cluster centres (one row per cluster).
kmean.cluster_centers_