In [38]:
from google.cloud import language_v1
import pandas as pd

In [39]:
df_nlp = pd.read_csv('/Users/clara/Desktop/neuefische/d-drivers/data/preprocessing_nlp.csv')

In [40]:
def sample_classify_text(text_content):
    """
    Classifying Content in a String

    Args:
      text_content The text content to analyze.
    """

    client = language_v1.LanguageServiceClient()

    # Available types: PLAIN_TEXT, HTML
    type_ = language_v1.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "de"
    document = {"content": text_content, "type_": type_, "language": language}

    content_categories_version = (
        language_v1.ClassificationModelOptions.V2Model.ContentCategoriesVersion.V2
    )
    response = client.classify_text(
        request={
            "document": document,
            "classification_model_options": {
                "v2_model": {"content_categories_version": content_categories_version}
            },
        }
    )
    # Loop through classified categories returned from the API
    for category in response.categories:
        # Get the name of the category representing the document.
        # See the predefined taxonomy of categories:
        # https://cloud.google.com/natural-language/docs/categories
        return("Category name: {}".format(category.name), "Confidence: {}".format(category.confidence))
        # Get the confidence. Number representing how certain the classifier
        # is that this category represents the provided text.

In [41]:
def classify_and_extract(text_content):
    """
    Classify the text content and extract category name and confidence.

    Args:
        text_content: The text content to classify.

    Returns:
        A tuple containing the category name and confidence.
    """
    try:
        result = sample_classify_text(text_content)
        category_name = result[0].split(": ")[1]
        confidence = float(result[1].split(": ")[1])
        return category_name, confidence
    except Exception as e:
        print(f"Error classifying text: {e}")
        return None, None

# Apply the function to each row in the 'h1' column
df_test = df_nlp[0:30]
df_test[['Category', 'Confidence']] = df_test['h1'].apply(lambda x: pd.Series(classify_and_extract(x)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[['Category', 'Confidence']] = df_test['h1'].apply(lambda x: pd.Series(classify_and_extract(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[['Category', 'Confidence']] = df_test['h1'].apply(lambda x: pd.Series(classify_and_extract(x)))


Let's have a look at the results:

In [49]:
pd.set_option('display.max_colwidth', None)
df_test[['h1','Category','Confidence']].query('Confidence > 0.8')

Unnamed: 0,h1,Category,Confidence
0,Elektrofahrzeug-Ladestation : Kosten Anbieter Vergleich,/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.835037
1,Elektroauto-Förderung 2024 : satt absahnen -,/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.928177
2,Hätten gewusst ? lange gibt schon E-Autos,/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.896956
5,E-Auto Kosten : laufenden Ausgaben müssen rechnen,/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.92792
9,"Plug-in-Hybrid kaufen : Autos Strom , Sprit Reichweitenangst",/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.990756
11,"Weit , , weitesten : E-Autos größten Reichweite",/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.81199
12,Schon morgen Garage : fünf E-Autos sofort verfügbar,/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.930591
14,Deutschland zahlt Förderung E-Autos : Länder,/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,0.866669
15,Premium-Ansprüche Korea ? Elektroautos Kia Überblick,/Autos & Vehicles/Motor Vehicles (By Brand)/Kia,0.840516
25,Elektroautos Fahranfänger Einsteiger : besten einfachen E-Autos 2022,/Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles,1.0


- 12 of 30 labels have a higher confidence than 0.8
- Upon closer investigation, no label is wrongly assigned. However some categories could be more detailed 
    - e.g. Elektroautos Fahranfänger Einsteiger : besten einfachen E-Autos 2022 -->  /Autos & Vehicles/Motor Vehicles (By Type)/Hybrid & Alternative Vehicles 
    - but Premium-Ansprüche Korea ? Elektroautos Kia Überblick	-->  /Autos & Vehicles/Motor Vehicles (By Brand)/Kia	

In [50]:
pd.set_option('display.max_colwidth', None)
df_test[['h1','Category','Confidence']].query('Confidence < 0.3')

Unnamed: 0,h1,Category,Confidence
3,Kleinstes Auto Welt kommt zurück : Fans selber bauen,"/Reference/General Reference/How-To, DIY & Expert Content",0.172663
6,"Auto gratis laden adieu : kostet Strom Aldi , Lidl , Rewe Co",/Food & Drink/Food & Grocery Retailers/Other,0.239954
20,stehen Chancen grünen Sprit : Wasserstoff Tank,/Autos & Vehicles/Vehicle Parts & Services/Gas Prices & Vehicle Fueling,0.224163
21,Gurkenwasser statt Streusalz : Abfallprodukt Develey Bayern versprüht,/Business & Industrial/Energy & Utilities/Waste Management,0.195685
23,Wallbox 11kW 22 kW : ADAC-Testsieger,/Business & Industrial/Industrial Materials & Equipment/Generators,0.118666
