In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer

# Load the UCI Adult (census income) data set. The raw file has no header
# row, so the column names are supplied explicitly.
url_income = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns_income = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']
# Fields in the file are separated by ", " and missing values are written as
# "?". Because skipinitialspace=True strips the blank after each delimiter,
# the marker reaches the NA matcher as '?', not ' ?'. Using ' ?' here would
# match nothing and leave every incomplete row in the frame.
df_income = pd.read_csv(url_income, header=None, names=columns_income, na_values='?', skipinitialspace=True)

# Discard every row that has at least one missing field.
df_income.dropna(inplace=True)

# Age summary statistics for each income bracket.
grouped_income = df_income.groupby('income')['age'].agg(['mean', 'median', 'min', 'max', 'std'])
print("Summary Statistics Grouped by 'income':")
print(grouped_income)

# Raw list of ages for each income bracket.
income_groups = df_income.groupby('income')['age'].apply(list)
print("\nList of numeric values for each response to 'income':")
print(income_groups)

# Fetch the UCI Iris data set; the file is headerless, so name the columns here.
url_iris = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
columns_iris = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
iris_df = pd.read_csv(url_iris, names=columns_iris, header=None)

# describe() per species: count, mean, std, min, quartiles and max of each
# numeric column.
iris_stats = iris_df.groupby('species').describe()
print("\nBasic Statistical Details of Iris Species:", iris_stats, sep="\n")

# Regress petal width on the other three measurements, once on the raw
# features and once on Box-Cox-transformed features, then compare the fits.
X = iris_df[['sepal_length', 'sepal_width', 'petal_length']]
y = iris_df['petal_width']

# Baseline: ordinary least squares on the untransformed features.
model = LinearRegression()
model.fit(X, y)
original_predictions = model.predict(X)
print("\nOriginal Linear Regression Coefficients:")
print(model.coef_)

# Box-Cox is only defined for strictly positive inputs.
# NOTE: PowerTransformer also standardizes its output by default, so the
# transformed coefficients are on a different scale from the baseline ones.
pt = PowerTransformer(method='box-cox')
X_transformed = pt.fit_transform(X)

model_transformed = LinearRegression()
model_transformed.fit(X_transformed, y)
transformed_predictions = model_transformed.predict(X_transformed)
print("\nTransformed Linear Regression Coefficients:")
print(model_transformed.coef_)

print("\nComparison of Original and Transformed Linear Regression Coefficients:")
# Index the table by feature name so each coefficient row can be tied to its
# column instead of an anonymous 0..2 position.
comparison_df = pd.DataFrame(
    {
        'Original Coefficients': model.coef_,
        'Transformed Coefficients': model_transformed.coef_,
    },
    index=X.columns,
)
print(comparison_df)


Summary Statistics Grouped by 'income':
             mean  median  min  max        std
income                                        
<=50K   36.783738    34.0   17   90  14.020088
>50K    44.249841    44.0   19   90  10.519028

List of numeric values for each response to 'income':
income
<=50K    [39, 50, 38, 53, 28, 37, 49, 23, 32, 34, 25, 3...
>50K     [52, 31, 42, 37, 30, 40, 43, 40, 56, 54, 31, 5...
Name: age, dtype: object

Basic Statistical Details of Iris Species:
                sepal_length                                              \
                       count   mean       std  min    25%  50%  75%  max   
species                                                                    
Iris-setosa             50.0  5.006  0.352490  4.3  4.800  5.0  5.2  5.8   
Iris-versicolor         50.0  5.936  0.516171  4.9  5.600  5.9  6.3  7.0   
Iris-virginica          50.0  6.588  0.635880  4.9  6.225  6.5  6.9  7.9   

                sepal_width         ... petal_length      petal_wi