In [77]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
#

In [73]:
Training_data = pd.read_csv('training_data.csv',parse_dates=['start','end','filed'])
Training_data['filing age'] = (datetime.now() - Training_data['filed']).dt.days
Feature_data = Training_data.dropna()
Result_data = Feature_data['Diluted EPS PA']
Feature_data = Feature_data.drop(columns=['start','end','filed','form','Diluted EPS PA']).copy()

In [74]:

df = Feature_data.copy()
# Define the columns for PCA
columns_for_pca = df.columns.tolist()

# Handle categorical column 'ticker' with ordinal encoding
categorical_features = ['ticker']
numerical_features = list(set(columns_for_pca) - set(categorical_features))

# Preprocessing for numerical data: fill missing values and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: ordinal encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler())
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply the transformations to the data
array_pca_ready = preprocessor.fit_transform(df)

# Generate the output column names
numerical_columns = numerical_features
encoded_categorical_columns = [f"{feature}_{i}" for feature in categorical_features for i in range(df[categorical_features[0]].nunique())]
output_columns = numerical_columns + categorical_features

# Save to a new CSV file if needed
df_pca_ready =pd.DataFrame(array_pca_ready, columns=output_columns)
#print(df_pca_ready.head())
df_pca_ready.to_csv('pca_input.csv', index=False)

In [88]:
cov_matrix = df_pca_ready.cov()

In [None]:
# Perform PCA iteratively to identify the right number of components
explained = np.empty(60)
for i in range(60):
    PCA_model = PCA(n_components=i)
    principalcomponents = PCA_model.fit(array_pca_ready)
    explained[i] = sum(principalcomponents.explained_variance_ratio_)
    #if sum(PCA_model.explained_variance_ratio_) >0.9999:
        #break
i = 20
print(f"{i} Components explains {explained[i]:0.2%} of the variance")
# Based on this data set 20 components explain 98.5% of the variance
    

20 Components explains 98.57% of the variance


In [85]:
PCA_model = PCA(n_components=40)
feature_data_PCA = PCA_model.fit_transform(array_pca_ready)

X_train, X_test, y_train, y_test = train_test_split(
    feature_data_PCA, Result_data,
    shuffle=True, test_size=0.1, random_state=0)

# Train the regression model on the transformed training set
model = LinearRegression() # or any other regression model
model.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)


Mean Squared Error (MSE): 91636934898.63142
Root Mean Squared Error (RMSE): 302715.93102879706
R-squared: -0.15408770950250483
