In [None]:
pip install pandas scikit-learn surprise


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357258 sha256=01a713837d9762d437125f0c7d2691eccde086a059d2e090caceca42f9926808
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully inst

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Sample user profiles and investment options data
# NOTE: within this cell only `user_preferences` is fed to the recommender;
# `user_profiles` and `investment_options` are defined but not read by the
# SVD code below.
user_profiles = pd.DataFrame({
    'user_id': [1, 2, 3, 4, 5],
    'age': [25, 35, 45, 32, 28],
    'risk_tolerance': [5, 2, 3, 4, 1],  # 1 to 5 scale
    'investment_goal': ['growth', 'income', 'balanced', 'growth', 'income']
})

# One row per investable product; the ids match `investment_id` in
# `user_preferences` below.
investment_options = pd.DataFrame({
    'investment_id': [101, 102, 103, 104, 105],
    'type': ['stocks', 'bonds', 'etf', 'mutual_funds', 'real_estate'],
    'risk_level': [5, 2, 3, 4, 1],  # 1 to 5 scale
    'expected_return': [0.15, 0.05, 0.08, 0.12, 0.07]
})

# Example user preferences for training (user_id, investment_id, rating)
# Each of the five users rates two of the five investments, so three
# (user, investment) pairs per user are left unrated.
user_preferences = pd.DataFrame({
    'user_id': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
    'investment_id': [101, 102, 103, 104, 105, 102, 103, 104, 105, 101],
    'rating': [5, 3, 4, 5, 2, 4, 3, 4, 5, 1]  # Scale of 1 to 5
})

# Create a Surprise dataset
# The Reader declares the bounds of the rating scale used in `user_preferences`.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(user_preferences[['user_id', 'investment_id', 'rating']], reader)

# Use the SVD algorithm
# SVD initialises its latent factors randomly; fixing the seed makes the
# predicted ratings (and therefore the printed top-N lists) reproducible
# across kernel restarts.
algo = SVD(random_state=42)

# Train the algorithm on the whole dataset
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict ratings for all pairs (u, i) not in the training set
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Get top N recommendations for each user
def get_top_n(predictions, n=5):
    """Group predictions by user and keep each user's highest estimates.

    Args:
        predictions: Iterable of 5-field records unpacked as
            ``(uid, iid, true_r, est, details)``.
        n: Number of items to keep per user (default 5).

    Returns:
        dict: ``uid -> [(iid, est), ...]`` sorted by ``est`` descending and
        cut to the first ``n`` entries. Users appear in the order they are
        first seen; ties keep their input order.
    """
    # Bucket every (item, estimate) pair under its user.
    per_user = {}
    for user, item, _actual, estimate, _details in predictions:
        per_user.setdefault(user, []).append((item, estimate))

    # Rank each bucket best-first and truncate it.
    return {
        user: sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]
        for user, pairs in per_user.items()
    }

top_n = get_top_n(predictions, n=3)

# Print the recommended items for each user: a header line per user followed
# by one indented line per recommended investment.
for uid, user_ratings in top_n.items():
    report = [f"User {uid}"]
    report.extend(
        f"  Investment {iid}: predicted rating {rating:.2f}"
        for iid, rating in user_ratings
    )
    print("\n".join(report))



User 1
  Investment 105: predicted rating 3.87
  Investment 104: predicted rating 3.73
  Investment 103: predicted rating 3.68
User 2
  Investment 104: predicted rating 3.50
  Investment 105: predicted rating 3.37
  Investment 101: predicted rating 3.28
User 3
  Investment 105: predicted rating 3.76
  Investment 101: predicted rating 3.58
  Investment 102: predicted rating 3.43
User 4
  Investment 102: predicted rating 3.84
  Investment 103: predicted rating 3.77
  Investment 101: predicted rating 3.61
User 5
  Investment 102: predicted rating 3.35
  Investment 104: predicted rating 3.23
  Investment 103: predicted rating 3.06


In [None]:
import pandas as pd

# Sample user profile data
# Keyed by user name. The labels used for "risk_tolerance", "esg_importance"
# and each entry of "investment_goals" must be keys of the weight tables
# inside recommend_portfolio ("low"/"medium"/"high" and
# "retirement"/"income generation"/"growth" respectively).
user_profiles = {
  "user1": {
    "risk_tolerance": "low",
    "esg_importance": "low",
    "investment_goals": ["retirement", "income generation"]
  },

}

# Sample investment options data with ESG ratings
# Keyed by option name. "expected_return" is a fraction (0.10 == 10%),
# "risk" is a text label and "esg_rating" is a number (50-90 in this sample).
investment_options = {
  "stock1": {
    "type": "stock",
    "expected_return": 0.10,
    "risk": "low",
    "esg_rating": 80
  },
  "stock2": {
    "type": "stock",
    "expected_return": 0.15,
    "risk": "medium",
    "esg_rating": 50
  },
  "bond1": {
    "type": "bond",
    "expected_return": 0.05,
    "risk": "low",
    "esg_rating": 90
  },
  "etf1": {
    "type": "etf",
    "expected_return": 0.12,
    "risk": "medium",
    "esg_rating": 70
  },
}

def recommend_portfolio(user_profile, investment_options):
  """
  Recommends a diversified portfolio based on user profile and investment options.

  Options riskier than the user's tolerance are dropped; the remainder are
  scored, ordered best-first and given an equal share of the portfolio.

  Args:
      user_profile (dict): Must contain "risk_tolerance" and "esg_importance"
          (each "low", "medium" or "high") and "investment_goals" (a list of
          "retirement", "income generation" and/or "growth").
      investment_options (dict): Maps option name to a dict with at least
          "risk" ("low"/"medium"/"high"), "expected_return" and "esg_rating".
          Options carrying any other risk label are excluded, because they
          cannot be shown to fit the user's tolerance.

  Returns:
      list: Dicts of the form {"option": name, "allocation": fraction},
          ordered by descending score. Allocations are equal and sum to 1.0.
          Empty when no option is within the user's risk tolerance.

  Raises:
      KeyError: If the profile uses a risk, ESG or goal label that is not
          listed above.
  """
  # Ordinal rank of each risk label. The labels must not be compared as raw
  # strings: alphabetically "high" < "low" < "medium", which is not risk order.
  risk_rank = {"low": 0, "medium": 1, "high": 2}

  # Define weights based on user profile and goals (adjust based on your strategy)
  risk_weight = {"low": 0.6, "medium": 0.3, "high": 0.1}
  esg_weight = {"low": 0.2, "medium": 0.5, "high": 0.8}
  growth_weight = {"retirement": 0.3, "income generation": 0.4, "growth": 0.6}

  # Calculate overall weights for each factor
  user_risk_weight = risk_weight[user_profile["risk_tolerance"]]
  user_esg_weight = esg_weight[user_profile["esg_importance"]]
  user_growth_weight = sum(growth_weight[goal] for goal in user_profile["investment_goals"])

  # Combine weights (adjust based on your preference)
  combined_weight = 0.5 * user_risk_weight + 0.3 * user_esg_weight + 0.2 * user_growth_weight

  # Filter options based on user risk tolerance. Unknown labels rank above
  # every known level, so they never pass the filter.
  max_rank = risk_rank[user_profile["risk_tolerance"]]
  filtered_options = {
      option: data
      for option, data in investment_options.items()
      if risk_rank.get(data["risk"], len(risk_rank)) <= max_rank
  }

  # Score each option based on weighted factors (adjust scoring logic as needed)
  scored_options = {
      option: (data["expected_return"] * combined_weight) + (data["esg_rating"] * user_esg_weight)
      for option, data in filtered_options.items()
  }

  # Sort options by score (descending)
  sorted_options = sorted(scored_options.items(), key=lambda x: x[1], reverse=True)

  # Nothing fits the risk tolerance: return an empty portfolio rather than
  # dividing by zero below.
  if not sorted_options:
    return []

  # Recommend top options with equal allocation (replace with more sophisticated logic)
  total_allocation = 1.0
  allocation_per_option = total_allocation / len(sorted_options)
  return [
      {"option": option, "allocation": allocation_per_option}
      for option, _ in sorted_options
  ]

# Example usage: build the recommendation for one sample profile and print
# one tab-indented "name: share" line per recommended option.
username = "user1"
user_data = user_profiles[username]
recommended_portfolio = recommend_portfolio(user_data, investment_options)

print("Recommended Portfolio for", username)
allocation_lines = [
  f"\t{option['option']}: {option['allocation']:.2f}"
  for option in recommended_portfolio
]
if allocation_lines:
  print("\n".join(allocation_lines))


Recommended Portfolio for user1
	bond1: 0.50
	stock1: 0.50


In [None]:
# import requests
# from bs4 import BeautifulSoup  # For basic HTML parsing (if needed)
# # Assuming you'll use a mock library for PDF processing (replace with actual library)
# !pip install mock_pdf_processor
# from mock_pdf_processor import process_pdf

# # Sentiment analysis library (replace with chosen library)
# from textblob import TextBlob

# def generate_esg_report(url):
#   """
#   Generates a basic ESG report by analyzing a company's annual report URL (mock implementation).

#   Args:
#       url (str): URL of the company's annual report (PDF or HTML).

#   Returns:
#       dict: Dictionary containing ESG scores (placeholder values in this example).
#   """
#   # Download and process the report content (replace with actual PDF processing)
#   report_text = process_pdf(url)

#   # Basic sentiment analysis (replace with more comprehensive analysis)
#   sentiment = TextBlob(report_text).sentiment

#   # Generate placeholder ESG scores (replace with actual analysis)
#   environment_score = 0.7  # Placeholder value (sentiment analysis can be integrated here)
#   social_score = 0.8  # Placeholder value (sentiment analysis can be integrated here)
#   governance_score = 0.6  # Placeholder value (sentiment analysis can be integrated here)

#   # Combine scores into a dictionary
#   report = {
#       "environment": environment_score,
#       "social": social_score,
#       "governance": governance_score,
#   }

#   return report

# # Example usage
# company_url = "https://aicl-mum-bucket.s3.ap-south-1.amazonaws.com/Production/www-tatamotors-com-NEW/wp-content/uploads/2024/05/tata-motor-IAR-2023-24.pdf"

# esg_report = generate_esg_report(company_url)

# print("ESG Report:")
# print(f"\tEnvironment Score: {esg_report['environment']}")
# print(f"\tSocial Score: {esg_report['social']}")
# print(f"\tGovernance Score: {esg_report['governance']}")



[31mERROR: Could not find a version that satisfies the requirement mock_pdf_processor (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for mock_pdf_processor[0m[31m
[0m

ModuleNotFoundError: No module named 'mock_pdf_processor'

In [None]:
pip install pandas scikit-learn




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the CSV file
# The code below reads the 'total_score' column of this table.
file_path = '/content/esg_rating.csv'  # Change this to your actual file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
# (sanity check that the file was parsed with the expected columns)
print(data.head())

# Define risk levels based on total ESG score
# Adjusted thresholds for the given range
def assign_risk_level(score):
    """Map a total ESG score to a risk label.

    Args:
        score: Numeric total ESG score.

    Returns:
        str: 'Low' for scores of at least 1200, 'Medium' for scores of at
        least 900, otherwise 'High'.
    """
    if score >= 1200:
        return 'Low'
    return 'Medium' if score >= 900 else 'High'

# Apply the function to create a new column for risk levels
data['risk_level'] = data['total_score'].apply(assign_risk_level)

# Prepare the data for the model
X = data[['total_score']]  # Feature
y = data['risk_level']  # Target

# Encode target labels
# The category order is pinned explicitly so that code i always means
# RISK_LEVEL_NAMES[i], even if one level is missing from the data. The order
# is alphabetical, i.e. the same codes that astype('category') assigns by
# default when all three levels occur (High=0, Low=1, Medium=2).
RISK_LEVEL_NAMES = ['High', 'Low', 'Medium']
y = y.astype(pd.CategoricalDtype(categories=RISK_LEVEL_NAMES)).cat.codes

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
# labels= ties every report row to its code, so the names cannot drift out of
# step with the encoding above (previously the 'Medium' and 'Low' rows were
# swapped). zero_division=0 keeps the report quiet when a level is absent
# from the test split.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(RISK_LEVEL_NAMES))),
    target_names=RISK_LEVEL_NAMES,
    zero_division=0,
))

# Function to predict risk level for a new ESG score
def predict_risk_level(esg_score):
    """Predict the risk label for a single total ESG score.

    Uses the module-level ``scaler`` and ``model`` fitted earlier in this cell.

    Args:
        esg_score: Total ESG score, on the same scale as the training data.

    Returns:
        str: 'High', 'Low' or 'Medium'.
    """
    # The training labels were encoded with astype('category').cat.codes,
    # which numbers categories in sorted order: High=0, Low=1, Medium=2.
    # The lookup list must follow that order; ['High', 'Medium', 'Low']
    # returned 'Low' for Medium predictions and vice versa.
    # NOTE(review): assumes all three levels occur in the training data;
    # if one is missing the codes shift.
    risk_levels = ['High', 'Low', 'Medium']
    scaled_score = scaler.transform([[esg_score]])
    risk_level_code = model.predict(scaled_score)[0]
    return risk_levels[risk_level_code]

# Example usage
# 1100 lies in the 900-1199 band that assign_risk_level labels 'Medium'.
new_esg_score = 1100
predicted_risk_level = predict_risk_level(new_esg_score)
print(f'The predicted risk level for an ESG score of {new_esg_score} is {predicted_risk_level}')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import numpy as np

# Load the CSV file
# The code below reads the 'total_score' column of this table.
file_path = '/content/esg_rating.csv'  # Change this to your actual file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
# (sanity check that the file was parsed with the expected columns)
print(data.head())

# Define risk rating based on total ESG score
# Adjusted thresholds for the given range
def assign_risk_rating(score):
    """Bucket a total ESG score into a 1-5 risk rating.

    Args:
        score: Numeric total ESG score.

    Returns:
        int: 1 (low risk) for scores of at least 1350, then 2, 3 and 4 for
        the bands starting at 1200, 1050 and 900; 5 (high risk) below 900.
    """
    # (lowest score of the band, rating), checked from the highest band down.
    bands = ((1350, 1), (1200, 2), (1050, 3), (900, 4))
    for floor, rating in bands:
        if score >= floor:
            return rating
    return 5

# Apply the function to create a new column for risk ratings
data['risk_rating'] = data['total_score'].apply(assign_risk_rating)

# Prepare the data for the model
X = data[['total_score']]  # Feature
y = data['risk_rating']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Round predictions to the nearest integer and clip values to ensure they fall within the 1-5 range.
# Cast to int so the predictions have the same type as the true ratings.
y_pred_rounded = np.clip(np.round(y_pred), 1, 5).astype(int)

# Print out prediction results for evaluation
for true_value, pred_value in zip(y_test, y_pred_rounded):
    print(f'True risk rating: {true_value}, Predicted risk rating: {int(pred_value)}')

# Update the original dataframe with predicted risk ratings.
# Stored as integers, so the CSV holds "3" rather than "3.0".
data['predicted_risk_rating'] = np.clip(np.round(model.predict(scaler.transform(X))), 1, 5).astype(int)

# Save the updated dataframe to a new CSV file.
# The later cells read '/content/esg_ratings_with_risk.csv', so write to that
# exact path instead of one relative to the current working directory.
output_file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your desired output file path
data.to_csv(output_file_path, index=False)

print(f'Updated CSV file saved to {output_file_path}')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your actual file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(data.head())

# Define the feature set and target variable.
# 'total_score' is the target, so it must not also be a feature, and
# 'risk_rating' is only a bucketed copy of 'total_score'. Keeping either one
# leaks the answer into the inputs and makes the reported error meaningless.
features = ['environment_score', 'social_score', 'governance_score']
X = data[features]
y = data['total_score']  # Assuming total_score can be used as a proxy for the sustainability score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the test set and evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Function to predict portfolio score based on selected companies
def predict_portfolio_score(selected_companies, sustainability_threshold=750):
    """Score a portfolio as the mean predicted score of its companies.

    Relies on the module-level ``data``, ``features``, ``scaler`` and ``model``
    defined earlier in this cell.

    Args:
        selected_companies: Company names, matched exactly against
            ``data['name']``.
        sustainability_threshold: Minimum mean score for the portfolio to be
            labelled "Sustainable". The model predicts ``total_score``, which
            this notebook buckets at 900-1350, so the former cut-off of 75
            labelled every portfolio sustainable. The default of 750 matches
            the cut-off used by the SVR version of this function.

    Returns:
        tuple: ``(status, score)`` where status is "Sustainable" or
        "Not Sustainable" and score is the mean predicted score. If none of
        the names match, returns ``("No valid companies selected.", 0)``.
    """
    # Filter the data for selected companies
    selected_data = data[data['name'].isin(selected_companies)]
    if selected_data.empty:
        return "No valid companies selected.", 0

    # Scale with the scaler fitted on the training data, then predict the
    # sustainability score for each company
    selected_features_scaled = scaler.transform(selected_data[features])
    predicted_scores = model.predict(selected_features_scaled)

    # Calculate the portfolio score (average score of selected companies)
    portfolio_score = np.mean(predicted_scores)

    # Determine if the portfolio is sustainable
    if portfolio_score >= sustainability_threshold:
        sustainability_status = "Sustainable"
    else:
        sustainability_status = "Not Sustainable"

    return sustainability_status, portfolio_score

# Example usage
# Names are matched exactly against data['name']; if none match, `status`
# holds the message "No valid companies selected." and `score` is 0.
selected_companies = ['Alphabet Inc', 'Marriott International Inc', 'Lam Research Corp']  # Change these to actual company names from your dataset
status, score = predict_portfolio_score(selected_companies)
print(f'The portfolio is {status} with a score of {score:.2f}')

# Save the model and scaler for future use
# Both files are written with relative paths, i.e. into the current working
# directory. The last call's return value is what the cell displays.
import joblib
joblib.dump(model, 'portfolio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

['scaler.pkl']

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

# Load the dataset
file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your actual file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(data.head())

# Define the feature set and target variable.
# 'total_score' is the target, so it must not also be a feature, and
# 'risk_rating' is only a bucketed copy of 'total_score'. Keeping either one
# leaks the answer into the inputs and makes the reported error meaningless.
features = ['environment_score', 'social_score', 'governance_score']
X = data[features]
y = data['total_score']  # Assuming total_score can be used as a proxy for the sustainability score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the test set and evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Function to predict portfolio score based on selected companies
def predict_portfolio_score(selected_companies, sustainability_threshold=750):
    """Score a portfolio as the truncated mean predicted score of its companies.

    Relies on the module-level ``data``, ``features``, ``scaler`` and ``model``
    defined earlier in this cell.

    Args:
        selected_companies: Company names, matched exactly against
            ``data['name']``.
        sustainability_threshold: Minimum integer score for the portfolio to
            be labelled "Sustainable". The model predicts ``total_score``,
            which this notebook buckets at 900-1350, so the former cut-off of
            75 labelled every portfolio sustainable. The default of 750
            matches the cut-off used by the SVR version of this function.

    Returns:
        tuple: ``(status, score)`` where status is "Sustainable" or
        "Not Sustainable" and score is the mean predicted score truncated to
        an int. If none of the names match, returns
        ``("No valid companies selected.", 0)``.
    """
    # Filter the data for selected companies
    selected_data = data[data['name'].isin(selected_companies)]
    if selected_data.empty:
        return "No valid companies selected.", 0

    # Scale with the scaler fitted on the training data, then predict the
    # sustainability score for each company
    selected_features_scaled = scaler.transform(selected_data[features])
    predicted_scores = model.predict(selected_features_scaled)

    # Calculate the portfolio score (average score of selected companies)
    # and convert it to an integer (int() truncates toward zero)
    portfolio_score_int = int(np.mean(predicted_scores))

    # Determine if the portfolio is sustainable
    if portfolio_score_int >= sustainability_threshold:
        sustainability_status = "Sustainable"
    else:
        sustainability_status = "Not Sustainable"

    return sustainability_status, portfolio_score_int

# Example usage
# Names are matched exactly against data['name']; if none match, `status`
# holds the message "No valid companies selected." and `score` is 0.
selected_companies = ['Alphabet Inc', 'Marriott International Inc', 'Lam Research Corp']  # Change these to actual company names from your dataset
status, score = predict_portfolio_score(selected_companies)
print(f'The portfolio is {status} with a score of {score}')

# Save the model and scaler for future use
# Both files are written with relative paths, i.e. into the current working
# directory. The last call's return value is what the cell displays.
joblib.dump(model, 'portfolio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

['scaler.pkl']

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib

# Load the dataset
file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your actual file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(data.head())

# Define the feature set and target variable.
# 'total_score' is the target, so it must not also be a feature, and
# 'risk_rating' is only a bucketed copy of 'total_score'. Keeping either one
# leaks the answer into the inputs and makes the reported error meaningless.
features = ['environment_score', 'social_score', 'governance_score']
X = data[features]
y = data['total_score']  # Assuming total_score can be used as a proxy for the sustainability score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Support Vector Regression (SVR) model
model = SVR(kernel='linear')
model.fit(X_train_scaled, y_train)

# Predict on the test set and evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Function to predict portfolio score based on selected companies
def predict_portfolio_score(selected_companies):
    """Score a portfolio as the truncated mean predicted score of its companies.

    Relies on the module-level ``data``, ``features``, ``scaler`` and ``model``
    defined earlier in this cell.

    Args:
        selected_companies: Company names, matched exactly against
            ``data['name']``.

    Returns:
        tuple: ``(status, score)`` where score is the mean predicted score
        truncated to an int and status is "Sustainable" when that score is at
        least 750, otherwise "Not Sustainable". If none of the names match,
        returns ``("No valid companies selected.", 0)``.
    """
    # Keep only the rows whose company name was requested.
    in_portfolio = data['name'].isin(selected_companies)
    holdings = data[in_portfolio]
    if holdings.empty:
        return "No valid companies selected.", 0

    # Scale with the fitted scaler, predict per company, then average and
    # truncate to a whole number.
    per_company = model.predict(scaler.transform(holdings[features]))
    whole_score = int(np.mean(per_company))

    if whole_score >= 750:
        return "Sustainable", whole_score
    return "Not Sustainable", whole_score

# Example usage
# Names are matched exactly against data['name']; if none match, `status`
# holds the message "No valid companies selected." and `score` is 0.
selected_companies = ['Alphabet Inc', 'Microsoft Corp', 'Lam Research Corp']  # Change these to actual company names from your dataset
status, score = predict_portfolio_score(selected_companies)
print(f'The portfolio is {status} with a score of {score}')

# Save the model and scaler for future use
# Both files are written with relative paths, i.e. into the current working
# directory. The last call's return value is what the cell displays.
joblib.dump(model, 'portfolio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

['scaler.pkl']

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error
# import joblib

# # Load the dataset
# file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your actual file path
# data = pd.read_csv(file_path)

# # Display the first few rows of the dataframe
# print(data.head())

# # Define the feature set and target variable
# features = ['total_score', 'environment_score', 'social_score', 'governance_score', 'risk_rating']
# X = data[features]
# y = data['total_score']  # Assuming total_score can be used as a proxy for the sustainability score

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Standardize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Train a Support Vector Regression (SVR) model with a polynomial kernel of degree 3
# model = SVR(kernel='poly', degree=3)
# model.fit(X_train_scaled, y_train)

# # Predict on the test set and evaluate the model
# y_pred = model.predict(X_test_scaled)
# mse = mean_squared_error(y_test, y_pred)
# print(f'Mean Squared Error: {mse}')

# # Function to predict portfolio score based on selected companies
# def predict_portfolio_score(selected_companies):
#     # Filter the data for selected companies
#     selected_data = data[data['name'].isin(selected_companies)]
#     if selected_data.empty:
#         return "No valid companies selected.", 0

#     # Prepare the features for prediction
#     selected_features = selected_data[features]
#     selected_features_scaled = scaler.transform(selected_features)

#     # Predict the sustainability score for each company
#     predicted_scores = model.predict(selected_features_scaled)

#     # Calculate the portfolio score (average score of selected companies)
#     portfolio_score = np.mean(predicted_scores)

#     # Convert the portfolio score to an integer
#     portfolio_score_int = int(portfolio_score)

#     # Determine if the portfolio is sustainable
#     sustainability_status = "Sustainable" if portfolio_score_int >= 75 else "Not Sustainable"

#     return sustainability_status, portfolio_score_int

# # Example usage
# selected_companies = ['Alphabet Inc', 'Microsoft Corp', 'Lam Research Corp']  # Change these to actual company names from your dataset
# status, score = predict_portfolio_score(selected_companies)
# print(f'The portfolio is {status} with a score of {score}')

# # Save the model and scaler for future use
# joblib.dump(model, 'portfolio_model.pkl')
# joblib.dump(scaler, 'scaler.pkl')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

['scaler.pkl']

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error
# import joblib

# # Load the dataset
# file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your actual file path
# data = pd.read_csv(file_path)

# # Display the first few rows of the dataframe
# print(data.head())

# # Define the feature set and target variable
# features = ['total_score', 'environment_score', 'social_score', 'governance_score', 'risk_rating']
# X = data[features]
# y = data['total_score']  # Assuming total_score can be used as a proxy for the sustainability score

# # Normalize the target variable to the range [1, 100]
# y = MinMaxScaler(feature_range=(1, 100)).fit_transform(y.values.reshape(-1, 1)).flatten()

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Standardize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Train a Support Vector Regression (SVR) model with a polynomial kernel of degree 3
# model = SVR(kernel='poly', degree=3)
# model.fit(X_train_scaled, y_train)

# # Predict on the test set and evaluate the model
# y_pred = model.predict(X_test_scaled)
# mse = mean_squared_error(y_test, y_pred)
# print(f'Mean Squared Error: {mse}')

# # Function to predict portfolio score based on selected companies
# def predict_portfolio_score(selected_companies):
#     # Filter the data for selected companies
#     selected_data = data[data['name'].isin(selected_companies)]
#     if selected_data.empty:
#         return "No valid companies selected.", 0

#     # Prepare the features for prediction
#     selected_features = selected_data[features]
#     selected_features_scaled = scaler.transform(selected_features)

#     # Predict the sustainability score for each company
#     predicted_scores = model.predict(selected_features_scaled)

#     # Calculate the portfolio score (average score of selected companies)
#     portfolio_score = np.mean(predicted_scores)

#     # Ensure the portfolio score is an integer within 1 to 100
#     portfolio_score_int = int(portfolio_score)

#     # Determine if the portfolio is sustainable
#     sustainability_status = "Sustainable" if portfolio_score_int >= 75 else "Not Sustainable"

#     return sustainability_status, portfolio_score_int

# # Example usage
# selected_companies = ['Alphabet Inc', 'Microsoft Corp', 'Lam Research Corp']  # Change these to actual company names from your dataset
# status, score = predict_portfolio_score(selected_companies)
# print(f'The portfolio is {status} with a score of {score}')

# # Save the model and scaler for future use
# joblib.dump(model, 'portfolio_model.pkl')
# joblib.dump(scaler, 'scaler.pkl')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

['scaler.pkl']

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib

# Load the dataset
file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your actual file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(data.head())

# Define the feature set and target variable.
# total_score is the regression target, so it must not also be an input column:
# with it among the features the model only has to copy that column, and the
# reported error says nothing about how well the other columns predict it.
features = ['environment_score', 'social_score', 'governance_score', 'risk_rating']
X = data[features]
y = data['total_score']  # Assuming total_score can be used as a proxy for the sustainability score

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Support Vector Regression (SVR) model
model = SVR(kernel='linear')
model.fit(X_train_scaled, y_train)

# Predict on the test set and evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Function to predict portfolio score based on selected companies
def predict_portfolio_score(selected_companies, sustainability_threshold=75):
    """Score a portfolio as the mean predicted score of the selected companies.

    Reads the module-level ``data``, ``features``, ``scaler`` and ``model``.

    Args:
        selected_companies: List-like of company names matched exactly against
            the ``name`` column of ``data``.
        sustainability_threshold: Minimum integer score for the portfolio to be
            labelled "Sustainable". Defaults to 75.

    Returns:
        A ``(status, score)`` tuple. ``status`` is "Sustainable" or
        "Not Sustainable"; ``score`` is the mean prediction truncated to an int.
        If no name matches, returns ``("No valid companies selected.", 0)``.
    """
    # Filter the data for selected companies
    selected_data = data[data['name'].isin(selected_companies)]
    if selected_data.empty:
        return "No valid companies selected.", 0

    # Prepare the features for prediction (same scaling as used for training)
    selected_features = selected_data[features]
    selected_features_scaled = scaler.transform(selected_features)

    # Predict the sustainability score for each company
    predicted_scores = model.predict(selected_features_scaled)

    # Calculate the portfolio score (average score of selected companies)
    portfolio_score = np.mean(predicted_scores)

    # Convert the portfolio score to an integer (int() truncates toward zero)
    portfolio_score_int = int(portfolio_score)

    # Determine if the portfolio is sustainable. The cut-off was previously 975,
    # which an averaged score could never reach, so every portfolio was reported
    # as "Not Sustainable".
    if portfolio_score_int >= sustainability_threshold:
        sustainability_status = "Sustainable"
    else:
        sustainability_status = "Not Sustainable"

    return sustainability_status, portfolio_score_int

# Function to get user input and predict the portfolio score
def main():
    """Prompt for comma-separated company names, score them, and print the result."""
    raw_entry = input("Enter the company names separated by commas: ")
    # Split on commas and trim surrounding whitespace from every piece
    names = list(map(str.strip, raw_entry.split(',')))
    outcome = predict_portfolio_score(names)
    status, score = outcome
    print(f'The portfolio is {status} with a score of {score}')

# Save the model and scaler for future use.
# Persist before starting the interactive prompt: main() blocks on input(), and
# interrupting that prompt aborts the cell, which previously skipped the saves.
joblib.dump(model, 'portfolio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Run the main function
if __name__ == "__main__":
    main()


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

KeyboardInterrupt: Interrupted by user

FileNotFoundError: [Errno 2] No such file or directory: 'model/portfolio_model.pkl'

In [None]:
!pip install --upgrade scikit-learn==1.2.2




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

# Read the ESG ratings table
file_path = '/content/esg_ratings_with_risk.csv'  # Change this to your actual file path
data = pd.read_csv(file_path)

# Inputs: environment, social and governance scores plus risk_rating; target: total_score
features = ['environment_score', 'social_score', 'governance_score', 'risk_rating']
X, y = data[features], data['total_score']

# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a linear regression on the standardized training features
model = LinearRegression().fit(X_train_scaled, y_train)

# Report the mean squared error on the held-out rows
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Persist the fitted model and scaler
joblib.dump(model, 'portfolio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


Mean Squared Error: 2.4155813146753286e-26


['scaler.pkl']

In [None]:
from flask import Flask, request, jsonify
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import joblib

app = Flask(__name__)

# Load the model and scaler.
# NOTE: joblib artifacts are pickle-based; only load files you produced yourself.
model = joblib.load('model/portfolio_model.pkl')
scaler = joblib.load('model/scaler.pkl')

# Load the dataset
file_path = 'data/esg_ratings_with_risk.csv'  # Ensure this path is correct relative to your Flask app
data = pd.read_csv(file_path)

# Define the feature set. It must match the columns the saved scaler/model were
# fitted on: the training cell that writes model/portfolio_model.pkl uses these
# four inputs and keeps total_score as the target only, so passing total_score
# here would make scaler.transform fail on the extra column.
features = ['environment_score', 'social_score', 'governance_score', 'risk_rating']

# Function to predict portfolio score based on selected companies
def predict_portfolio_score(selected_companies):
    """Return ``(status, score)`` for the companies whose ``name`` is in ``selected_companies``.

    The score is the mean model prediction over the matching rows, truncated to
    an int and clamped into 1..100; the status is "Sustainable" from 75 upwards.
    When no row matches, ``("No valid companies selected.", 0)`` is returned.
    """
    chosen_rows = data.loc[data['name'].isin(selected_companies)]
    if chosen_rows.empty:
        return "No valid companies selected.", 0

    # Scale the feature columns and average the per-company predictions
    scaled_inputs = scaler.transform(chosen_rows[features])
    mean_prediction = np.mean(model.predict(scaled_inputs))

    # Truncate to an integer, then keep it within 1..100
    bounded_score = min(100, max(1, int(mean_prediction)))

    if bounded_score >= 75:
        label = "Sustainable"
    else:
        label = "Not Sustainable"
    return label, bounded_score

@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict.

    Expects a JSON object with a ``companies`` entry (a list of company names;
    a single string is accepted as one name). Responds with
    ``{"status": ..., "score": ...}``, or with a 400 error when the body is
    missing, is not a JSON object, or carries no usable ``companies`` value.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON body
    payload = request.get_json(silent=True)
    selected_companies = payload.get('companies') if isinstance(payload, dict) else None

    if not selected_companies:
        return jsonify({"error": "No companies provided"}), 400

    # Series.isin only accepts list-likes, so wrap a bare string as a single name
    if isinstance(selected_companies, str):
        selected_companies = [selected_companies]
    elif not isinstance(selected_companies, list):
        return jsonify({"error": "'companies' must be a list of company names"}), 400

    status, score = predict_portfolio_score(selected_companies)
    return jsonify({"status": status, "score": score})

# Start Flask's built-in server when this code runs as the main module.
# host='0.0.0.0' listens on every network interface; the port is 5000.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


FileNotFoundError: [Errno 2] No such file or directory: 'model/portfolio_model.pkl'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import joblib

import os

# Load the dataset
file_path = '/content/esg_ratings_with_risk.csv'
data = pd.read_csv(file_path)

# Define the feature set and target variable
features = ['environment_score', 'social_score', 'governance_score', 'risk_rating']
X = data[features]
y = data['total_score']  # target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Save the model and scaler for future use.
# joblib.dump does not create missing parent directories, so make sure the
# 'model' folder exists first; without it the dump fails with FileNotFoundError.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/portfolio_model.pkl')
joblib.dump(scaler, 'model/scaler.pkl')


FileNotFoundError: [Errno 2] No such file or directory: 'model/portfolio_model.pkl'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib

# Read the ESG ratings table
file_path = '/content/esg_ratings_with_risk.csv'
data = pd.read_csv(file_path)

# Input columns and regression target
features = ['environment_score', 'social_score', 'governance_score', 'risk_rating']
X = data[features]
y = data['total_score']  # target variable

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split, then scale both splits with it
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

# Linear-kernel support vector regression
model = SVR(kernel='linear')
model.fit(X_train_scaled, y_train)

# Mean squared error on the held-out split
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Persist the fitted model and scaler
joblib.dump(model, 'portfolio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


Mean Squared Error: 0.25390256902110075


['scaler.pkl']

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib

# Load the dataset
file_path = '/content/esg_ratings_with_risk.csv'  # Update with your actual file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(data.head())

# Define the feature set and target variable.
# total_score is the target, so it is excluded from the inputs: keeping it in
# the feature list leaks the answer into the model, and it also produces a
# five-column scaler that cannot transform the four feature columns the Flask
# service passes when it loads portfolio_model.pkl / scaler.pkl.
features = ['environment_score', 'social_score', 'governance_score', 'risk_rating']
X = data[features]
y = data['total_score']  # Assuming total_score can be used as a proxy for the sustainability score

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Support Vector Regression (SVR) model
model = SVR(kernel='linear')
model.fit(X_train_scaled, y_train)

# Predict on the test set and evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Save the model and scaler for future use
joblib.dump(model, 'portfolio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


  ticker                           name currency  \
0    dis                 Walt Disney Co      USD   
1     gm              General Motors Co      USD   
2    gww                WW Grainger Inc      USD   
3    mhk          Mohawk Industries Inc      USD   
4    lyv  Live Nation Entertainment Inc      USD   

                        exchange                            industry  \
0  NEW YORK STOCK EXCHANGE, INC.                               Media   
1  NEW YORK STOCK EXCHANGE, INC.                         Automobiles   
2  NEW YORK STOCK EXCHANGE, INC.  Trading Companies and Distributors   
3  NEW YORK STOCK EXCHANGE, INC.                   Consumer products   
4  NEW YORK STOCK EXCHANGE, INC.                               Media   

                                                logo  \
0  https://static.finnhub.io/logo/ef50b4a2b263c84...   
1  https://static.finnhub.io/logo/9253db78-80c9-1...   
2  https://static.finnhub.io/logo/f153dcda-80eb-1...   
3  https://static.finnhub.io/l

['scaler.pkl']

In [None]:
from flask import Flask, request, jsonify
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import joblib

# Flask application object that the route decorators below register against
app = Flask(__name__)

# Load the fitted model and scaler from the current working directory.
# NOTE(review): joblib files are pickle-based — only load artifacts produced by a trusted training run.
model = joblib.load('portfolio_model.pkl')
scaler = joblib.load('scaler.pkl')

# Load the dataset (assuming it includes 'name' column)
# NOTE(review): the training cells in this notebook read '/content/esg_ratings_with_risk.csv';
# confirm that '/data/...' is the intended location for the serving copy.
file_path = '/data/esg_ratings_with_risk.csv'  # Update with your actual file path
data = pd.read_csv(file_path)

# Columns passed to scaler.transform / model.predict by predict_portfolio_score.
# Presumably these must match the columns the loaded scaler was fitted on —
# verify against the training cell that wrote the .pkl files.
features = ['environment_score', 'social_score', 'governance_score', 'risk_rating']

# Function to predict portfolio score based on selected companies
def predict_portfolio_score(selected_companies, sustainability_threshold=75):
    """Score a portfolio as the mean predicted score of the selected companies.

    Reads the module-level ``data``, ``features``, ``scaler`` and ``model``.

    Args:
        selected_companies: List-like of company names matched exactly against
            the ``name`` column of ``data``.
        sustainability_threshold: Minimum integer score for the portfolio to be
            labelled "Sustainable". Defaults to 75.

    Returns:
        A ``(status, score)`` tuple. ``status`` is "Sustainable" or
        "Not Sustainable"; ``score`` is the mean prediction truncated to an int.
        If no name matches, returns ``("No valid companies selected.", 0)``.
    """
    # Filter the data for selected companies
    selected_data = data[data['name'].isin(selected_companies)]
    if selected_data.empty:
        return "No valid companies selected.", 0

    # Prepare the features for prediction (same scaling as used for training)
    selected_features = selected_data[features]
    selected_features_scaled = scaler.transform(selected_features)

    # Predict the sustainability score for each company
    predicted_scores = model.predict(selected_features_scaled)

    # Calculate the portfolio score (average score of selected companies)
    portfolio_score = np.mean(predicted_scores)

    # Convert the portfolio score to an integer (int() truncates toward zero)
    portfolio_score_int = int(portfolio_score)

    # Determine if the portfolio is sustainable. The cut-off was previously 975,
    # which an averaged score could never reach, so the endpoint always answered
    # "Not Sustainable".
    if portfolio_score_int >= sustainability_threshold:
        sustainability_status = "Sustainable"
    else:
        sustainability_status = "Not Sustainable"

    return sustainability_status, portfolio_score_int

@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict.

    Expects a JSON object with a ``companies`` entry (a list of company names;
    a single string is accepted as one name). Responds with
    ``{"status": ..., "score": ...}``, or with a 400 error when the body is
    missing, is not a JSON object, or carries no usable ``companies`` value.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON body
    payload = request.get_json(silent=True)
    selected_companies = payload.get('companies') if isinstance(payload, dict) else None

    if not selected_companies:
        return jsonify({"error": "No companies provided"}), 400

    # Series.isin only accepts list-likes, so wrap a bare string as a single name
    if isinstance(selected_companies, str):
        selected_companies = [selected_companies]
    elif not isinstance(selected_companies, list):
        return jsonify({"error": "'companies' must be a list of company names"}), 400

    status, score = predict_portfolio_score(selected_companies)
    return jsonify({"status": status, "score": score})

@app.route('/', methods=['GET'])
def home():
    """Answer GET / with a fixed JSON message so a client can confirm the service is reachable."""
    body = {"message": "Connected successfully"}
    return jsonify(body)

# Start Flask's built-in server when this code runs as the main module.
# host='0.0.0.0' listens on every network interface; the port is 5000.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:
!pip install tika
!pip install spacy
!pip install transformers


Collecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Created wheel for tika: filename=tika-2.6.0-py3-none-any.whl size=32621 sha256=e1eef28975b05e001d5c424d2124a1bd6745e98a8fcdfc7a715ee24f334dc236
  Stored in directory: /root/.cache/pip/wheels/5f/71/c7/b757709531121b1700cffda5b6b0d4aad095fb507ec84316d0
Successfully built tika
Installing collected packages: tika
Successfully installed tika-2.6.0


In [None]:
from flask import Flask, request, jsonify
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from tika import parser
import spacy
import pandas as pd

app = Flask(__name__)


def _build_text_classifier(checkpoint):
    """Return ``(tokenizer, model, pipeline)`` for a Hugging Face sequence-classification checkpoint."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    pipe = pipeline("text-classification", model=classifier, tokenizer=tokenizer)
    return tokenizer, classifier, pipe


# Environmental text classifier (EnvironmentalBERT) from Hugging Face
env_name = "ESGBERT/EnvironmentalBERT-environmental"
env_tokenizer, env_model, env_pipe = _build_text_classifier(env_name)

# Sentiment analysis model (FinBERT)
sentiment_name = "ProsusAI/finbert"
sentiment_tokenizer, sentiment_model, sentiment_pipe = _build_text_classifier(sentiment_name)

# Function to parse document content using Apache Tika
def parse_document_content(path):
    """Parse the file at ``path`` through the Tika server and return its ``content`` entry."""
    tika_endpoint = 'http://localhost:8000'
    parsed = parser.from_file(path, serverEndpoint=tika_endpoint)
    return parsed["content"]

# Function to process sentences for classification and sentiment analysis
def process_sentences(sentences, classification_pipe, sentiment_pipe):
    """Label every sentence with both pipelines and tabulate the results.

    Args:
        sentences: List of sentence strings.
        classification_pipe: Callable invoked as ``pipe(sentences, padding=True,
            truncation=True)`` that returns one dict with a ``"label"`` key per sentence.
        sentiment_pipe: Callable with the same calling convention.

    Returns:
        A DataFrame with the columns ``sentence``, ``classification_label`` and
        ``sentiment_label``, one row per sentence.
    """
    pipe_options = {"padding": True, "truncation": True}

    def _labels(pipe):
        # Keep only the predicted label from each result dict
        return [result["label"] for result in pipe(sentences, **pipe_options)]

    # The classification pipeline runs first, then the sentiment pipeline
    classification_labels = _labels(classification_pipe)
    sentiment_labels = _labels(sentiment_pipe)

    columns = {
        "sentence": sentences,
        "classification_label": classification_labels,
        "sentiment_label": sentiment_labels,
    }
    return pd.DataFrame(columns)

import functools
import os
import tempfile


@functools.lru_cache(maxsize=1)
def _load_sentence_splitter():
    """Load the spaCy English pipeline once and reuse it for every request."""
    nlp = spacy.load('en_core_web_sm')
    nlp.max_length = 10000000  # Set a higher limit as needed
    return nlp


# Route to upload and process a document
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle POST /upload: classify the sentences of an uploaded document.

    Expects a multipart form with a ``file`` part. The file is parsed with
    Tika, split into sentences with spaCy, and the first 100 sentences that
    start with an uppercase letter are labelled by the environmental and
    sentiment pipelines.

    Returns:
        200 with a JSON array of ``{sentence, classification_label,
        sentiment_label}`` records (empty when no sentence qualifies);
        400 when no file was supplied; 422 when no text could be extracted;
        500 with the error message for any other failure.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    file_path = None
    try:
        # Give every request its own temporary file so that concurrent uploads
        # cannot overwrite each other; it is removed again in the finally block.
        handle, file_path = tempfile.mkstemp(suffix='.pdf')
        os.close(handle)
        file.save(file_path)

        # Parse document content using Tika
        document_content = parse_document_content(file_path)
        if not document_content:
            # The parsed content may be missing or empty when no text could be
            # extracted; report that instead of failing inside spaCy.
            return jsonify({"error": "No text could be extracted from the document"}), 422

        # Split into sentences with the cached spaCy pipeline
        nlp = _load_sentence_splitter()
        about_doc = nlp(document_content)
        sequences = list(map(str, about_doc.sents))
        sentences = [x.replace("\n", "") for x in sequences if x != "" and x[0].isupper()]

        # Limit the number of sentences for processing (adjust as needed)
        sub_sentences = sentences[:100]
        if not sub_sentences:
            # Nothing to classify; skip the model calls
            return jsonify([]), 200

        # Process sentences for environmental classification and sentiment analysis
        env_data = process_sentences(sub_sentences, env_pipe, sentiment_pipe)

        # jsonify sets the application/json content type, which a bare JSON
        # string returned from the view would not get
        return jsonify(env_data.to_dict(orient='records')), 200

    except Exception as e:
        return jsonify({"error": str(e)}), 500

    finally:
        if file_path is not None and os.path.exists(file_path):
            os.remove(file_path)

# Start Flask's built-in server when this code runs as the main module.
# host='0.0.0.0' listens on every network interface; the port requested here is 6000.
# NOTE(review): the saved cell output reports port 5000, so that output appears
# to come from a different run — confirm which port is intended.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=6000)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/951 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
