1. Statistical Hypothesis Testing           
Example: Flipping a Coin

In [2]:
import numpy as np
from scipy import stats

# Simulate 100 flips of a fair coin (1 = heads, 0 = tails); seeded so the
# result is reproducible.
np.random.seed(0)
flips = np.random.binomial(n=1, p=0.5, size=100)

# Null hypothesis: the coin is fair (p=0.5)
# Alternative hypothesis: the coin is not fair (p != 0.5)

# Performing an exact binomial test.
# - `stats.binom_test` is no longer available in SciPy; `stats.binomtest`
#   replaces it and returns a result object whose `.pvalue` holds the p-value.
# - The alternative hypothesis stated above is two-sided (p != 0.5), so the
#   test is run two-sided rather than with alternative='greater'.
successes = int(np.sum(flips))  # plain int: binomtest expects an integer count
n = len(flips)
p_value = stats.binomtest(successes, n, p=0.5, alternative='two-sided').pvalue

print(f"Number of successes (heads): {successes}")
print(f"p-value: {p_value:.4f}")

# Decide at the 5% significance level.
if p_value < 0.05:
    print("Reject the null hypothesis: the coin is not fair.")
else:
    print("Fail to reject the null hypothesis: the coin is fair.")


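Output: AttributeError: module 'scipy.stats' has no attribute 'binom_test' (`binom_test` was deprecated and later removed from SciPy; use `stats.binomtest(...).pvalue` instead.)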

2. p-Values             
Example: One-sample t-test

In [None]:
import numpy as np
from scipy import stats

# Reproducible sample: weights of 10 apples drawn from N(150, 10^2)
np.random.seed(0)
weights = np.random.normal(loc=150, scale=10, size=10)

# H0: the mean weight equals 150
# H1: the mean weight differs from 150
mean_weight = 150

# One-sample t-test of the sample against the hypothesised mean
test_result = stats.ttest_1samp(weights, popmean=mean_weight)
t_stat = test_result.statistic
p_value = test_result.pvalue

print(f"Sample weights: {weights}")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")

# Report the decision at the 5% significance level
verdict = (
    "Reject the null hypothesis: the mean weight is not 150."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: the mean weight is 150."
)
print(verdict)


3. Confidence Intervals     
Example: Confidence Interval for the Mean

In [None]:
import numpy as np
from scipy import stats

# Reproducible sample: weights of 10 apples
np.random.seed(0)
weights = np.random.normal(loc=150, scale=10, size=10)

# Point estimate and its standard error
mean_weight = weights.mean()
sem = stats.sem(weights)

# 95% interval from the t distribution with n - 1 degrees of freedom,
# centred on the sample mean and scaled by the standard error
confidence_level = 0.95
degrees_of_freedom = weights.size - 1
ci = stats.t.interval(
    confidence_level,
    degrees_of_freedom,
    loc=mean_weight,
    scale=sem,
)

print(f"Sample weights: {weights}")
print(f"Mean weight: {mean_weight:.2f}")
print(f"95% Confidence interval: {ci}")


4. p-Hacking        
Example: Multiple Hypothesis Testing

In [None]:
import numpy as np
from scipy import stats

# 20 independent samples of 100 standard-normal draws (one row per test)
np.random.seed(0)
data = np.random.normal(loc=0, scale=1, size=(20, 100))

# One-sample t-test of every row against a mean of 0
p_values = []
for sample in data:
    p_values.append(stats.ttest_1samp(sample, 0).pvalue)

# Bonferroni correction: divide the significance level by the number of tests
alpha = 0.05
corrected_alpha = alpha / len(p_values)
significant_tests = [p_val < corrected_alpha for p_val in p_values]

print(f"p-values: {p_values}")
print(f"Corrected alpha: {corrected_alpha:.4f}")
print(f"Significant tests: {significant_tests}")
print(f"Number of significant tests: {sum(significant_tests)}")


5. Example: Running an A/B Test     
Example: A/B Test

In [None]:
import numpy as np
from scipy import stats

# Simulated binary outcomes (1 = success) for both arms of the experiment
np.random.seed(0)
control = np.random.binomial(1, 0.4, 100)  # success probability 0.4
treatment = np.random.binomial(1, 0.5, 100)  # success probability 0.5

# Two-sample t-test comparing the group means
comparison = stats.ttest_ind(control, treatment)
t_stat = comparison.statistic
p_value = comparison.pvalue

print(f"Control group mean: {np.mean(control):.2f}")
print(f"Treatment group mean: {np.mean(treatment):.2f}")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")

# Report the decision at the 5% significance level
conclusion = (
    "Reject the null hypothesis: there is a significant difference between the groups."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: there is no significant difference between the groups."
)
print(conclusion)


6. Bayesian Inference       
Example: Bayesian Updating for a Binomial Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta

# Beta(2, 2) prior on the probability of success
alpha_prior, beta_prior = 2, 2

# Observed data: 6 heads and 4 tails
successes, failures = 6, 4

# Posterior parameters: prior parameters plus the observed counts
alpha_post = alpha_prior + successes
beta_post = beta_prior + failures

# Evaluate both densities on a 100-point grid over [0, 1]
x = np.linspace(0, 1, 100)
prior = beta.pdf(x, alpha_prior, beta_prior)
posterior = beta.pdf(x, alpha_post, beta_post)

# Draw the prior and posterior densities on one set of axes
plt.figure(figsize=(10, 6))
for density, curve_label in (
    (prior, f'Prior: Beta({alpha_prior}, {beta_prior})'),
    (posterior, f'Posterior: Beta({alpha_post}, {beta_post})'),
):
    plt.plot(x, density, label=curve_label)
plt.title('Prior and Posterior Distributions')
plt.xlabel('Probability of Success')
plt.ylabel('Density')
plt.legend()
plt.show()


7. Gradient Descent     
Example: Gradient Descent to Minimize a Quadratic Function

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Quadratic function: f(x) = x^2
def f(x):
    """Return ``x`` squared, i.e. the quadratic objective f(x) = x^2."""
    return pow(x, 2)

# Derivative of the quadratic function: f'(x) = 2x
def f_prime(x):
    """Return ``2 * x``, the derivative of x^2 evaluated at ``x``."""
    slope = 2 * x
    return slope

# Gradient descent configuration
x0 = 10  # Starting point
learning_rate = 0.1
iterations = 20

# Run the descent, recording every iterate (the starting point included)
x = x0
x_values = [x]

# x_values already holds the start, so it ends with iterations + 1 entries
while len(x_values) <= iterations:
    step = learning_rate * f_prime(x)
    x = x - step  # move against the gradient
    x_values.append(x)

# Curve of the objective over [-10, 10]
x_range = np.linspace(-10, 10, 400)
y_range = f(x_range)

# Objective value at each recorded iterate (shared by the markers and the path)
descent_heights = [f(x_i) for x_i in x_values]

plt.figure(figsize=(10, 6))
plt.plot(x_range, y_range, label='f(x) = x^2')
plt.scatter(x_values, descent_heights, color='red')
plt.plot(x_values, descent_heights, color='red', linestyle='--')
plt.title('Gradient Descent to Minimize f(x) = x^2')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend()
plt.show()


8. Minibatch and Stochastic Gradient Descent        
Example: Stochastic Gradient Descent (SGD) for Linear Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Synthetic data: y = 4 + 3 * X plus standard-normal noise, X uniform on [0, 2)
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Prepend a column of ones so theta[0] acts as the intercept
X_b = np.hstack([np.ones((100, 1)), X])

# Random starting parameters and optimiser settings
theta = np.random.randn(2, 1)
learning_rate = 0.1
iterations = 50

# Stochastic gradient descent: every update uses one randomly drawn sample
for _epoch in range(iterations):
    for _ in range(100):
        random_index = np.random.randint(100)
        # Slices (not scalar indexing) keep xi and yi two-dimensional
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        residual = np.dot(xi, theta) - yi
        gradients = 2 * np.dot(xi.T, residual)
        theta -= learning_rate * gradients

# Scatter the observations and overlay the fitted line
fitted = X_b.dot(theta)

plt.figure(figsize=(10, 6))
plt.scatter(X, y)
plt.plot(X, fitted, color='red')
plt.title('Stochastic Gradient Descent for Linear Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.show()


9. Data Handling        
Example: Reading Files

In [None]:
# Example data file: 'data.txt'
# Content of 'data.txt':
# 1,2,3
# 4,5,6
# 7,8,9

def read_file(filename):
    """Read a comma-separated text file into a list of rows.

    Each line is stripped of surrounding whitespace and split on commas.
    Fields are returned as strings; no type conversion is performed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list containing one list of string fields per line of the file.
    """
    rows = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            rows.append(raw_line.strip().split(','))
    return rows

# Parse the sample file described above and display the resulting rows.
# NOTE: 'data.txt' is a relative path, so it is resolved against the current
# working directory.
filename = 'data.txt'
data = read_file(filename)
print(data)


10. Scraping the Web        
Example: Web Scraping with BeautifulSoup

In [None]:
import requests
from bs4 import BeautifulSoup

# URL to scrape
url = 'http://example.com'

# Send a GET request to the URL.
# An explicit timeout is passed because requests otherwise waits indefinitely
# for a response, and raise_for_status() aborts on 4xx/5xx replies instead of
# parsing an error page as if it were the requested content.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract and print the title of the webpage.
# soup.title is None when the document has no <title> tag, so guard the
# attribute access rather than failing with AttributeError.
title = soup.title.string if soup.title is not None else None
print('Title:', title)

# Extract and print all paragraph texts
paragraphs = soup.find_all('p')
for p in paragraphs:
    print(p.text)


11. Using APIs      
Example: Using the Twitter API

In [None]:
import tweepy

# Placeholder credentials: substitute your own before running
consumer_key = 'your_consumer_key'
consumer_secret = 'your_consumer_secret'
access_token = 'your_access_token'
access_token_secret = 'your_access_token_secret'

# Build an authenticated API client from the OAuth 1.0a user credentials
auth = tweepy.OAuth1UserHandler(
    consumer_key,
    consumer_secret,
    access_token,
    access_token_secret,
)
api = tweepy.API(auth)

# Look up the account with the screen name 'twitter'
user = api.get_user(screen_name='twitter')

print('User details:')
for label, attribute in (
    ('Name:', 'name'),
    ('Screen name:', 'screen_name'),
    ('Location:', 'location'),
    ('Description:', 'description'),
):
    print(label, getattr(user, attribute))

print('Last 10 Followers:')
for follower in user.followers(count=10):
    print(follower.name)


12. Working with Data       
Exploring Your Data     

Example: Basic Data Exploration with Pandas

In [None]:
import pandas as pd

# Create a sample DataFrame
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [2, 3, 4, 5, 6]
}
df = pd.DataFrame(data)

# Display basic information about the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Display basic statistics of the DataFrame
print(df.describe())

# Display the first few rows of the DataFrame
print(df.head())

# Display the correlation matrix of the DataFrame
print(df.corr())


13. Using NamedTuples       
Example: Creating and Using NamedTuples

In [None]:
from collections import namedtuple

# A tuple subclass named 'Point' with two named fields, x and y
Point = namedtuple('Point', 'x y')

# Build an instance, naming each field explicitly
p = Point(x=1, y=2)

# Fields can be read by attribute name ...
print('x:', p.x)
print('y:', p.y)

# ... or by position, like an ordinary tuple
print('x:', p[0])
print('y:', p[1])


14. Dataclasses     
Example: Creating and Using Dataclasses

In [None]:
from dataclasses import dataclass

@dataclass()
class Point:
    """Record with two fields, ``x`` and ``y``, both annotated as ``int``."""

    x: int  # first field (first positional argument)
    y: int  # second field (second positional argument)

# Build an instance, naming each field explicitly
p = Point(x=1, y=2)

# Fields are plain attributes
print('x:', p.x)
print('y:', p.y)

# Printing the instance shows its generated representation
print(p)


15. Cleaning and Munging        
Example: Data Cleaning with Pandas

In [None]:
import pandas as pd

# Sample DataFrame in which None marks the missing entries
data = {
    'A': [1, 2, None, 4, 5],
    'B': [None, 4, 3, 2, 1],
    'C': [2, None, 4, None, 6]
}
df = pd.DataFrame(data)

# Show the data before any cleaning
print('Original DataFrame:')
print(df)

# Option 1: discard every row that has at least one missing value
df_dropped = df.dropna(axis=0, how='any')
print('\nDataFrame after dropping rows with missing values:')
print(df_dropped)

# Option 2: replace missing values with a constant
df_filled = df.fillna(value=0)
print('\nDataFrame after filling missing values with 0:')
print(df_filled)

# Option 3: replace missing values with the mean of their own column
column_means = df.mean()
df_filled_mean = df.fillna(value=column_means)
print('\nDataFrame after filling missing values with column mean:')
print(df_filled_mean)


16. Manipulating Data       
Example: Data Manipulation with Pandas

In [None]:
import pandas as pd

# Sample DataFrame with three integer columns
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [2, 3, 4, 5, 6]
}
df = pd.DataFrame(data)

# New column D holds the element-wise sum of A and B
df['D'] = df['A'].add(df['B'])
print('DataFrame after adding column D:')
print(df)

# Rename A -> Alpha and B -> Beta
new_names = {'A': 'Alpha', 'B': 'Beta'}
df = df.rename(columns=new_names)
print('\nDataFrame after renaming columns:')
print(df)

# Keep only the rows whose Alpha value exceeds 2
alpha_above_two = df['Alpha'] > 2
filtered_df = df[alpha_above_two]
print('\nFiltered DataFrame where Alpha > 2:')
print(filtered_df)

# Group the rows by column C and average the remaining columns per group
grouped_df = df.groupby(by='C').mean()
print('\nGrouped and aggregated DataFrame (mean of each group in column C):')
print(grouped_df)


17. Rescaling       
Example: Data Rescaling with Scikit-Learn

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Sample DataFrame with three integer columns
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [2, 3, 4, 5, 6]
}
df = pd.DataFrame(data)

# Min-Max scaling; fit_transform returns a plain array, so the column labels
# are re-attached when rebuilding the DataFrame
min_max_scaler = MinMaxScaler()
min_max_values = min_max_scaler.fit_transform(df)
df_min_max_scaled = pd.DataFrame(min_max_values, columns=df.columns)
print('DataFrame after Min-Max Scaling:')
print(df_min_max_scaled)

# Standard scaling (z-score normalization), rebuilt the same way
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(df)
df_standard_scaled = pd.DataFrame(standardized_values, columns=df.columns)
print('\nDataFrame after Standard Scaling:')
print(df_standard_scaled)


18. An Aside: tqdm      
Example: Progress Bar with tqdm

In [None]:
from tqdm import tqdm
import time

# Wrapping the iterable in tqdm displays a progress bar while the loop runs
for _ in tqdm(range(10)):
    time.sleep(0.5)  # stand-in for real work: pause for half a second


19. Dimensionality Reduction        
Example: PCA with Scikit-Learn

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Sample DataFrame with three integer columns.
# NOTE: in this sample B = 6 - A and C = A + 1, so the three columns are
# perfectly linearly related.
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [2, 3, 4, 5, 6]
}
df = pd.DataFrame(data)

# Project the data onto two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df)
df_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# Scatter the observations in the space of the two components
plt.figure(figsize=(8, 6))
plt.scatter(df_pca['PC1'], df_pca['PC2'])
plt.title('PCA of Sample Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
