In [20]:
import pickle
import pandas as pd
from tqdm import tqdm
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForTokenClassification
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

tqdm.pandas()

: 

In [21]:
def load_models():
    sentiment_analyzer = pipeline(
        model=AutoModelForSequenceClassification.from_pretrained("sentiment_model"),
        tokenizer=AutoTokenizer.from_pretrained("sentiment_model")
    )
    ner_model = pipeline(
        "ner",
        model=AutoModelForTokenClassification.from_pretrained("ner_model"),
        tokenizer=AutoTokenizer.from_pretrained("ner_model"),
        aggregation_strategy="simple"
    )
    return sentiment_analyzer, ner_model
def preprocess_and_stem(text):
    # Initialize the stemmer
    stemmer = PorterStemmer()
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into a single string
    processed_text = ' '.join(stemmed_tokens)
    
    return processed_text

def feature_extraction(df):
    sentiment_analyzer, ner_model = load_models()

    # Apply preprocessing and stemming
    df["Processed"] = df["Combined"].progress_apply(preprocess_and_stem)

    # Apply sentiment analysis with progress bar
    df["sentiment"] = df["Processed"].progress_apply(lambda x: sentiment_analyzer(x)[0]['label'])

    # Apply NER with progress bar
    df["ner_entities"] = df["Processed"].progress_apply(ner_model)

    sentiment_map = {"POSITIVE": 1, "NEGATIVE": 0}
    df['sentiment_numeric'] = df['sentiment'].map(sentiment_map)

    injury_terms = ["ankle", "knee", "hamstring", "groin", "thigh", "head", "concussion", "neck", "shoulder"]
    for term in injury_terms:
        df[f'{term}_injury'] = df['ner_entities'].progress_apply(
            lambda entities: any(term in e['word'] for e in entities)
        )

    return df[["Combined", "Processed", "sentiment", "sentiment_numeric", "ner_entities", "ankle_injury", "knee_injury", "hamstring_injury", "groin_injury", "thigh_injury", "head_injury", 
               "concussion_injury", "neck_injury", "shoulder_injury"]]


ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [3]:
with open('playersWithInjurie10-2-24.pickle', 'rb') as handle:
    all_injuries_df = pickle.load(handle)


In [4]:
temp = feature_extraction(all_injuries_df)

I0000 00:00:1727906993.535082    5394 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-10-02 22:09:53.579811: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertFo

In [8]:
with open('temp10-3-24-2.pickle', 'wb') as handle:
    pickle.dump(temp,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [13]:


# Download NLTK data (run this once)
# nltk.download('punkt')
# nltk.download('stopwords')


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
def save_models():
    # Save sentiment analysis model
    sentiment_analyzer = pipeline(
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student"
    )
    sentiment_analyzer.model.save_pretrained("sentiment_model2")
    sentiment_analyzer.tokenizer.save_pretrained("sentiment_model2")

    # Save NER model
    ner_model = pipeline("ner", aggregation_strategy="simple")
    ner_model.model.save_pretrained("ner_model")
    ner_model.tokenizer.save_pretrained("ner_model")

# Call save_models once to save the models
save_models()

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
