In [1]:
import pandas as pd

# Two small frames with the same column layout ('letter', 'number').
df1 = pd.DataFrame({'letter': ['a', 'b'], 'number': [1, 2]})
df2 = pd.DataFrame({'letter': ['c', 'd'], 'number': [1, 2]})

# Stack the frames row-wise. ignore_index=True renumbers the rows 0..n-1
# instead of keeping each input frame's own 0, 1 labels.
result = pd.concat([df1, df2], ignore_index=True)
print(result)


  letter  number
0      a       1
1      b       2
2      c       1
3      d       2


In [2]:
# Build the two input frames. Both are keyed by a string 'id' column and
# carry identically named feature columns; only ids '1' and '2' are shared.
df1 = pd.DataFrame(
    {
        'id': ['1', '2', '3', '4', '5'],
        'Feature1': ['A', 'C', 'E', 'G', 'I'],
        'Feature2': ['B', 'D', 'F', 'H', 'J'],
    }
)
df2 = pd.DataFrame(
    {
        'id': ['1', '2', '6', '7', '8'],
        'Feature1': ['K', 'M', 'O', 'Q', 'S'],
        'Feature2': ['L', 'N', 'P', 'R', 'T'],
    }
)

# Question 1: inner join -- keep only ids present in both frames.
# The clashing feature columns receive pandas' default '_x' / '_y' suffixes.
merged_inner = df1.merge(df2, how='inner', on='id')
print(merged_inner)

# Question 2: outer join -- keep every id from either frame and label the
# clashing columns by their source frame instead of '_x' / '_y'.
merged_outer = df1.merge(df2, how='outer', on='id', suffixes=('_df1', '_df2'))
print(merged_outer)

  id Feature1_x Feature2_x Feature1_y Feature2_y
0  1          A          B          K          L
1  2          C          D          M          N
  id Feature1_df1 Feature2_df1 Feature1_df2 Feature2_df2
0  1            A            B            K            L
1  2            C            D            M            N
2  3            E            F          NaN          NaN
3  4            G            H          NaN          NaN
4  5            I            J          NaN          NaN
5  6          NaN          NaN            O            P
6  7          NaN          NaN            Q            R
7  8          NaN          NaN            S            T


In [3]:
import pandas as pd
import numpy as np

# Fixed seed so the synthetic data (and every output derived from it) is
# identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Generate dates.
# The alternative data covers every calendar day but stops on 2021-12-15,
# while the market data covers business days through 2021-12-31.
all_dates = pd.date_range('2021-01-01', '2021-12-15')
business_dates = pd.bdate_range('2021-01-01', '2021-12-31')

# Generate tickers
tickers = ['AAPL','FB','GE','AMZN','DAI']

# Create MultiIndex: one (Date, Ticker) pair per date/ticker combination.
index_alt = pd.MultiIndex.from_product([all_dates, tickers], names=['Date','Ticker'])
index = pd.MultiIndex.from_product([business_dates, tickers], names=['Date','Ticker'])

# Create DataFrames filled with standard-normal noise from the seeded generator.
market_data = pd.DataFrame(
    rng.standard_normal((len(index), 3)),
    index=index,
    columns=['Open','Close','Close_Adjusted']
)

alternative_data = pd.DataFrame(
    rng.standard_normal((len(index_alt), 2)),
    index=index_alt,
    columns=['Twitter','Reddit']
)

# Left join on the (Date, Ticker) index: every market_data row is kept.
# Business days after 2021-12-15 have no alternative data, so their
# 'Twitter' / 'Reddit' values are NaN.
merged_df = market_data.merge(
    alternative_data,
    how='left',
    left_index=True,
    right_index=True
)

# The right-hand index is unique, so the left join must not add or drop rows.
assert merged_df.shape == (len(index), 5), merged_df.shape
print(merged_df.shape)  # Should be (1305, 5)
print(merged_df.head())

(1305, 5)
                       Open     Close  Close_Adjusted   Twitter    Reddit
Date       Ticker                                                        
2021-01-01 AAPL    1.161921 -1.586993       -0.709861 -0.380245 -1.370224
           FB      1.097984 -1.367535        0.641274 -0.216462  0.742371
           GE     -0.936785  0.830933        1.506496  0.695036  1.440038
           AMZN   -0.876961  0.769495       -1.647499 -0.139150  0.267542
           DAI     1.087713  0.261699       -0.291028 -0.264503 -0.182004


In [4]:
# import pandas as pd
# import numpy as np

def winsorize(df_series, quantiles):
    """
    Clip the values of ``df_series`` to a lower and an upper quantile.

    Parameters
    ----------
    df_series : pd.Series or single-column pd.DataFrame
        Numeric data to winsorize. Missing values (NaN) are ignored when the
        quantiles are computed and stay NaN in the result.
    quantiles : list
        [lower_quantile, upper_quantile], e.g. [0.05, 0.95]. Only the first
        two entries are used.

    Returns
    -------
    The winsorized Series or DataFrame column: values below the lower
    quantile are raised to it, values above the upper quantile are lowered
    to it, everything else is unchanged.
    """
    # np.quantile returns NaN as soon as the input contains a single NaN, and
    # clip() treats a NaN bound as "no bound" -- so data with missing values
    # would silently come back unclipped. nanquantile skips the NaNs instead
    # and gives the same result as np.quantile on NaN-free data.
    min_value, max_value = np.nanquantile(df_series, [quantiles[0], quantiles[1]])

    return df_series.clip(lower=min_value, upper=max_value)

# Ungrouped demo: the integers 1..10 clipped to their 20th / 80th percentiles.
df = pd.DataFrame({'sequence': range(1, 11)})

winsorized_df = winsorize(df['sequence'], [0.2, 0.8])
# print(winsorized_df.to_markdown())

# Grouped demo: the integers 1..50 split into five consecutive groups of ten,
# labelled 1.0 .. 5.0.
groups = np.repeat(np.arange(1.0, 6.0), 10)
df_grouped = pd.DataFrame({'group': groups, 'sequence': range(1, 51)})

# Winsorize each group on its own 5th / 95th percentiles, then drop index
# level 0 (the group key) so only the original row labels remain.
winsorized_grouped = (
    df_grouped
    .groupby('group')[['sequence']]
    .apply(winsorize, quantiles=[0.05, 0.95])
    .reset_index(level=0, drop=True)
)

# Eleven rows: the whole first group plus the first row of the second group.
print(winsorized_grouped.head(11).to_markdown())

|    |   sequence |
|---:|-----------:|
|  0 |       1.45 |
|  1 |       2    |
|  2 |       3    |
|  3 |       4    |
|  4 |       5    |
|  5 |       6    |
|  6 |       7    |
|  7 |       8    |
|  8 |       9    |
|  9 |       9.55 |
| 10 |      11.45 |


In [5]:


# Sales-style sample: one price per row, tagged with its product.
df = pd.DataFrame(
    {
        'value': [20.45, 22.89, 32.12, 111.22, 33.22, 100, 99.99],
        'product': ['table', 'chair', 'chair', 'mobile phone', 'table', 'mobile phone', 'table'],
    }
)

# Per-product min / max / mean of 'value'. Named aggregation produces the
# flat column names directly, so no MultiIndex has to be flattened afterwards.
# Resulting columns: ['value_min', 'value_max', 'value_mean']
agg_df = df.groupby('product')['value'].agg(
    value_min='min',
    value_max='max',
    value_mean='mean',
)
print(agg_df)

              value_min  value_max  value_mean
product                                       
chair             22.89      32.12      27.505
mobile phone     100.00     111.22     105.610
table             20.45      99.99      51.220
