<a href="https://colab.research.google.com/github/Opperdraak/Industry_Guide/blob/main/Industry_Guide_Linkedin_Post.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages
!pip install openai==0.28
!pip install pandas requests
!pip install tqdm

# Import required libraries
import pandas as pd
import openai
import requests
import re
from tqdm import tqdm
import time
import numpy as np
from google.colab import drive
from google.colab import files

# Mount Google Drive so you can read files in your Drive.
# NOTE(review): 'drive' is a relative mount point, so it resolves against the
# current working directory. The read_csv call later in this notebook reads
# from /content/drive/..., which presumably relies on the working directory
# being /content — confirm before running outside a default Colab session.
drive.mount('drive')

# Set OpenAI API key (Replace "GET-YOUR-OWN-API-KEY" with your actual API key).
# Do not share or commit the notebook with a real key filled in.
openai.api_key = "GET-YOUR-OWN-API-KEY"


In [None]:
# Load the full company list from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Location for your file/your_file.csv')

# If your file has more than 1000 entries, you want to split it up into
# different chunks (5 in this example) and run the code one chunk at a time.
# The row positions are split instead of the DataFrame itself: passing a
# DataFrame straight to np.array_split goes through DataFrame.swapaxes, which
# pandas has deprecated. Chunk sizes and row labels are the same either way.
num_chunks = 5
df = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), num_chunks)]

# Change the iteration number (0 .. num_chunks - 1) to select a different subset of data
iteration = 0

# Define a function to classify the industry of a company using the OpenAI API
def classify_company_industry(company_name, retries=3, retry_delay=0.5):
    """Ask the OpenAI chat API which industry a company works in.

    Args:
        company_name: Text identifying the company; it is inserted verbatim
            into the user prompt.
        retries: Maximum number of API attempts (values below 1 are treated
            as a single attempt).
        retry_delay: Base pause in seconds. It is slept once after a
            successful call, and ``retry_delay * attempt`` after a failed one.

    Returns:
        The model's answer with surrounding whitespace removed.

    Raises:
        openai.error.OpenAIError: A transient error (connection, rate limit,
            timeout, server-side) is re-raised once the last attempt has
            failed. Any other API error propagates immediately because
            retrying would not help.
    """
    # Only these error types are worth retrying.
    transient_errors = (
        openai.error.APIConnectionError,
        openai.error.APIError,
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    # The prompt does not change between attempts, so build it once.
    messages = [
        {"role": "system", "content": "You are an AI language model trained to analyze and detect the industry of company names based on the name itself, but also on the domain of their email."},
        {"role": "user", "content": f"Analyze the following company list and determine what industry they are working in. Return 2 words at most. {company_name}"}
    ]

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-4",
                messages=messages,
                max_tokens=3,
                n=1,
                stop=None,
                temperature=0
            )
        except transient_errors as err:
            print(f"Attempt {attempt}/{attempts} failed for {company_name!r}: {err}")
            if attempt == attempts:
                raise
            # Wait a little longer after every failure before trying again.
            time.sleep(retry_delay * attempt)
            continue

        industry = completion.choices[0].message.content.strip()
        print(industry)

        # Short pause between successful calls to stay clear of rate limits.
        time.sleep(retry_delay)
        return industry

# Pick the chunk selected by `iteration`
df = df[iteration]

# Ask the model for one industry per company, showing a progress bar
industries = [
    classify_company_industry(name)
    for name in tqdm(df["company_name"], desc="Classifying industries")
]

# Put names, ids and predictions side by side and renumber the rows from 0
results_df = pd.DataFrame({
    'company_name': df['company_name'],
    'company_id' : df['company_id'],
    'predicted_industry': industries,
}).reset_index(drop=True)

# Display the results DataFrame
results_df

In [None]:
# Now we will clean up some of the results; please add more rules to this
# table if you find more errors.
# Each rule is (canonical label, regex searched in the lower-cased prediction).
# Rules are tried top to bottom and the first match wins.
PREDICTED_INDUSTRY_RULES = [
    ("Automotive", r"automotive"),
    # Whole word only, so "smart", "parts" or "artificial" are not relabelled as Art.
    ("Art", r"\barts?\b"),
    ("Aviation", r"aviation|aerospace"),
    ("Construction", r"construction"),
]


def clean_predicted_industry(prediction):
    """Return the canonical label for a raw prediction.

    The prediction is returned unchanged when no rule matches. Non-string
    values (for example a missing prediction) are passed through untouched
    instead of raising on ``.lower()``.
    """
    if not isinstance(prediction, str):
        return prediction

    lowered = prediction.lower()
    for label, pattern in PREDICTED_INDUSTRY_RULES:
        if re.search(pattern, lowered):
            return label
    return prediction


# Rewrite the column in a single assignment. Chained indexing such as
# results_df["predicted_industry"][x] = ... may write to a temporary copy
# instead of the DataFrame itself.
results_df["predicted_industry"] = results_df["predicted_industry"].map(clean_predicted_industry)


In [None]:
# Distinct predicted industries of the rows whose general_industry is the
# empty string.
# NOTE(review): pd.read_csv loads blank cells as NaN rather than '', and NaN
# rows are not selected by this comparison — confirm how this frame was built.
unique_values = df.loc[df['general_industry'] == '', 'predicted_industry'].unique()

# Same values as a plain Python list
unique_values_list = [*unique_values]

# Define a function to group detailed industries into general industry categories
def group_industries_detailed_to_general(detailed_industries):
    """Map each detailed industry to a general category using the OpenAI chat API.

    Args:
        detailed_industries: Iterable of detailed industry labels. A label
            that occurs more than once is only sent to the API once.

    Returns:
        A dict mapping every input label to the model's answer with
        surrounding whitespace removed. The value is ``''`` when no usable
        answer was obtained after three attempts, so one bad label does not
        discard the mapping built so far.

    Raises:
        openai.error.OpenAIError: Non-transient API errors (for example an
            invalid API key) are not retried and propagate to the caller.
    """
    max_attempts = 3
    # Only these error types are worth retrying.
    transient_errors = (
        openai.error.APIConnectionError,
        openai.error.APIError,
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    industry_mapping = {}

    for detailed_industry in detailed_industries:
        if detailed_industry in industry_mapping:
            # Already resolved; no need to pay for another request.
            continue

        messages = [
            {"role": "system", "content": "You are an AI language model. Trained in grouping industries into general industry categories."},
            {"role": "user", "content": f"Group the industry '{detailed_industry}' into general industry categories. Use 1 word max per industry, no special characters like commas, dashes, slashes, or dots."}
        ]

        response_text = ""
        for attempt in range(1, max_attempts + 1):
            try:
                completion = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=messages,
                    max_tokens=2,
                    n=1,
                    stop=None,
                    temperature=0.5
                )
            except transient_errors as err:
                print(f"Attempt {attempt}/{max_attempts} failed for {detailed_industry!r}: {err}")
                # Wait a little longer after every failure before trying again.
                time.sleep(0.5 * attempt)
                continue

            response_text = completion.choices[0].message['content'].strip()
            print(f"{detailed_industry} -> {response_text}")

            # Stop as soon as a non-empty answer is obtained
            if response_text:
                break

        # Store the mapping ('' when every attempt failed or came back empty)
        industry_mapping[detailed_industry] = response_text
        time.sleep(0.5)

    return industry_mapping

# Build the detailed -> general lookup for the unique predicted industries
unique_predicted_industries = unique_values_list
industry_mapping = group_industries_detailed_to_general(
    detailed_industries=unique_predicted_industries
)

# Function to map values based on the industry mapping
def map_industry(row):
    """Return the general industry for one row.

    A value already present in ``general_industry`` is kept as is. Only an
    empty string is replaced, using the ``industry_mapping`` lookup keyed by
    the row's ``predicted_industry`` ('' when the key is unknown).
    """
    existing = row['general_industry']
    if existing != '':
        return existing
    return industry_mapping.get(row['predicted_industry'], '')

# Run map_industry once per row and store the result back in 'general_industry'
df['general_industry'] = df.apply(map_industry, axis='columns')


In [None]:
# Keyword rules for tidying up the general industry column.
# Each rule is (keywords, canonical label); a rule applies when any keyword
# occurs in the lower-cased value. Rules are tried top to bottom and the first
# match wins.
GENERAL_INDUSTRY_RULES = [
    (("property", "reale", "housing"), "Real Estate"),
    (("aquac",), "Science"),
    (("legal",), "Legal Services"),
    (("banking",), "Finance"),
    (("health",), "Fitness & Health"),  # "health" also covers "healthcare"
    (("food", "bever"), "Consumption"),
    (
        ("uncategorized", "apologies", "miscellaneous", "uncertain", "the industry", "your query"),
        "Indeterminable",
    ),
]


def clean_general_industry(value):
    """Return the canonical label for a general industry value.

    The value is compared as a lower-cased string. When no rule matches, the
    original value is returned unchanged, whatever its type.
    """
    text = str(value).lower()
    for keywords, label in GENERAL_INDUSTRY_RULES:
        if any(keyword in text for keyword in keywords):
            return label
    return value


# Rewrite the column in a single assignment. df["general_industry"][x] looks
# rows up by index label, so looping over range(len(df)) fails when the index
# does not run from 0 to len(df) - 1, and the chained assignment may write to a
# temporary copy instead of the DataFrame.
df["general_industry"] = df["general_industry"].map(clean_general_industry)

# Save the DataFrame to a CSV file
df.to_csv('Industry_prediction_fin.csv')
