In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="imports"></a>
# Imports

following libraries are used for:
1. re - remove unwanted charater from string using regulare expression
2. nltk - remove stop words and conjunctions
3. numpy - transform data into respective shape
4. pandas - import data from file into dataframe
5. matplotlib - create visualization
6. wordcloud - create word cloud

# **Resume Screening**

In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

nltk.download("stopwords")

<a id="read-csv"></a>
# Read CSV

ID and Resume_html columns does not contain useful information and are not aligend with my interest. Therefore, both columns are removed.

In [None]:
df = pd.read_csv('/kaggle/input/resume-dataset/Resume/Resume.csv')

df.head()

In [None]:
# drop unused columns
del df['ID']
del df['Resume_html']
df

In [None]:
df.rename(columns = {'Resume_str':'Resume'}, inplace = True) 
df

<a id="preprocessing"></a>
# Preprocessing

In this stage, I followed basic cleaning processes for text analysis which includes:
1. converting characters to lowercases.
2. remove punctuations, numbers and foreign languages.
3. tokenize word. (spliting sentence into unigram)


In [None]:
def preprocess(txt):
    # convert all characters in the string to lower case
    txt = txt.lower()
    # remove non-english characters, punctuation and numbers
    txt = re.sub('[^a-zA-Z]', ' ', txt)
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace
    # tokenize word
    txt = nltk.tokenize.word_tokenize(txt)
    # remove stop words
    txt = [w for w in txt if not w in nltk.corpus.stopwords.words('english')]
    

    return ' '.join(txt)

In [None]:
# preprocessing text
df['Resume'] = df['Resume'].apply(lambda w: preprocess(w))


In [None]:
df

<a id="exploratory-data-analysis"></a>
# Exploratory Data Analysis

EDA is excercised to inspect class imbalance, word similarity and word frequency.

In [None]:
import string
from nltk.corpus import stopwords
oneSetOfStopWords = set(stopwords.words('english')+['``',"''"])
totalWords =[]
Sentences = df['Resume'].values
cleanedSentences = ""
for records in Sentences:
    cleanedSentences += records
    requiredWords = nltk.word_tokenize(records)
    for word in requiredWords:
        if word not in oneSetOfStopWords and word not in string.punctuation:
            totalWords.append(word)
    
wordfreqdist = nltk.FreqDist(totalWords)
mostcommon = wordfreqdist.most_common(50)
print(mostcommon)

In [None]:
wc = WordCloud().generate(cleanedSentences)
plt.figure(figsize=(10,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# create list of all categories
categories = np.sort(df['Category'].unique())
categories

In [None]:
# create new df for corpus and category
df_categories = [df[df['Category'] == category].loc[:, ['Resume', 'Category']] for category in categories]
df_categories

<a id="barchart"></a>
## Bar Chart

The bar chart shows the number of records for each category, where class imbalanced is spotted easily.

In [None]:
df['Category'].value_counts().sort_index().plot(kind='bar', figsize=(12, 6))
plt.show()

<a id="wordcloud"></a>
## Word Cloud

After word clouds are created, the word "manag" (inflected for "manage") prominently visible on many categories. Moreover, words like "citi", "state" and "compani" are noticeable on different categories as well. These common words are likely to contain low weight for computation. On the other hand, words like "account", "develop" and "design" probably has higher weight for calculation because it only appears on specific domains.

In [None]:
def wordcloud(df):
    txt = ' '.join(txt for txt in df['Resume'])
    wordcloud = WordCloud(
        height=2000,
        width=4000,
        colormap=WORDCLOUD_COLOR_MAP
    ).generate(txt)

    return wordcloud

In [None]:
WORDCLOUD_COLOR_MAP = 'tab10_r'
plt.figure(figsize=(32, 28))

for i, category in enumerate(categories):
    wc = wordcloud(df_categories[i])

    plt.subplot(5, 5, i + 1).set_title(category)
    plt.imshow(wc)
    plt.axis('off')
    plt.plot()

plt.show()
plt.close()

from sklearn.preprocessing import LabelEncoder

var_mod = ['Category']
le = LabelEncoder()
for i in var_mod:
    df[i] = le.fit_transform(df[i])<a id="wordfreq"></a>
## Word Frequency Table

Word frequency is plotted to visualize how often popular words are used. Most categories have similar distribution on top 10 frequently used words.

In [None]:
def wordfreq(df):
    count = df['Resume'].str.split(expand=True).stack().value_counts().reset_index()
    count.columns = ['Word', 'Frequency']

    return count.head(10)

In [None]:
fig = plt.figure(figsize=(32, 64))

for i, category in enumerate(categories):
    wf = wordfreq(df_categories[i])

    fig.add_subplot(5, 5, i + 1).set_title(category)
    plt.bar(wf['Word'], wf['Frequency'])
    plt.ylim(0, 3500)

plt.show()
plt.close()

In [None]:
from sklearn.preprocessing import LabelEncoder

var_mod = ['Category']
le = LabelEncoder()
for i in var_mod:
    df[i] = le.fit_transform(df[i])

In [None]:
df

In [None]:
df.Category.value_counts()


In [None]:
#Remove Category with a few records (implanced data)
df = df[df.Category != 2]
df = df[df.Category != 5]
df = df[df.Category != 8]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

requiredText = df['Resume'].values
requiredTarget = df['Category'].values

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english')
word_vectorizer.fit(requiredText)
WordFeatures = word_vectorizer.transform(requiredText)

print ("Feature completed .....")



In [None]:
X_train,X_test,y_train,y_test = train_test_split(WordFeatures,requiredTarget,random_state=42, test_size=0.2,
                                                 shuffle=True, stratify=requiredTarget)
print(X_train.shape)
print(X_test.shape)

In [None]:
clf = OneVsRestClassifier(LogisticRegression(random_state=16))
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set:     {:.2f}'.format(clf.score(X_test, y_test)))

In [None]:
print("\n Classification report for classifier %s:\n%s\n" % (clf, metrics.classification_report(y_test, prediction)))

In [None]:
clf1 = OneVsRestClassifier(DecisionTreeClassifier(criterion="entropy", max_depth=3))
clf1.fit(X_train, y_train)
prediction = clf1.predict(X_test)
print('Accuracy of DecisionTreeClassifier on training set: {:.2f}'.format(clf1.score(X_train, y_train)))
print('Accuracy of DecisionTreeClassifier on test set:     {:.2f}'.format(clf1.score(X_test, y_test)))

In [None]:
print("\n Classification report for classifier %s:\n%s\n" % (clf1, metrics.classification_report(y_test, prediction)))

# **Resume Parsing**

In [None]:
!pip install tika

In [None]:
from tika import parser

In [None]:
file = r'/kaggle/input/resume-dataset/data/data/INFORMATION-TECHNOLOGY/10553553.pdf'
file_data = parser.from_file(file)

In [None]:
text = file_data['content']
#print(text)

In [None]:
parsed_content = {}

In [None]:
#E-MAIL
import re
def get_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

email = get_email_addresses(text)
print(email)
parsed_content['E-mail'] = email

In [None]:
#PHONE NUMBER
import re
def get_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', num) for num in phone_numbers]

phone_number= get_phone_numbers(text)
if len(phone_number) <= 10:
    print(phone_number)
    parsed_content['Phone number'] = phone_number

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

def extract_name(text):
    nlp_text = nlp(text)
    # First name and Last name are always Proper Nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
  
    matcher.add('NAME', [pattern], on_match = None)
  
    matches = matcher(nlp_text)
  
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

name = extract_name(text)
print(name)
parsed_content['Name'] =  name

In [None]:
Keywords = ["education",
            "summary",
            "accomplishments",
            "executive profile",
            "professional profile",
            "personal profile",
            "work background",
            "academic profile",
            "other activities",
            "qualifications",
            "experience",
            "interests",
            "skills",
            "achievements",
            "publications",
            "publication",
            "certifications",
            "workshops",
            "projects",
            "internships",
            "trainings",
            "hobbies",
            "overview",
            "objective",
            "position of responsibility",
            "jobs"
           ]

In [None]:
text = text.replace("\n"," ")
text = text.replace("[^a-zA-Z0-9]", " ");  
re.sub('\W+','', text)
text = text.lower()
#print(text)

In [None]:
content = {}
indices = []
keys = []
for key in Keywords:
    try:
        content[key] = text[text.index(key) + len(key):]
        indices.append(text.index(key))
        keys.append(key)
    except:
        pass
    

In [None]:
#Sorting the indices
zipped_lists = zip(indices, keys)
sorted_pairs = sorted(zipped_lists)
sorted_pairs

tuples = zip(*sorted_pairs)
indices, keys = [ list(tuple) for tuple in  tuples]
keys

In [None]:
#Keeping the required content and removing the redundant part
content = []
for idx in range(len(indices)):
    if idx != len(indices)-1:
        content.append(text[indices[idx]: indices[idx+1]])
    else:
        content.append(text[indices[idx]: ])

In [None]:
for i in range(len(indices)):
    parsed_content[keys[i]] = content[i]  

In [None]:
parsed_content

In [None]:
pd.DataFrame(parsed_content.items()) 

In [None]:
parsed_content['skills']

<a id="Question & Answering Model"></a>
# Question & Answering Model

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.1.2 which is incompatible.
cudf 23.8.0 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.3.0 

In [4]:
import os
import transformers
import torch
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer
from kaggle_secrets import UserSecretsClient

2024-03-31 09:52:08.209111: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 09:52:08.209198: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 09:52:08.375089: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
user_secrets = UserSecretsClient()
os.environ["HF_TOKEN"]= user_secrets.get_secret("HF_TOKEN")
#os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [6]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             token=os.environ['HF_TOKEN'])

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [8]:
print (type(tokenizer))

<class 'transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast'>


In [9]:
text = "Question :What are fields in Information Technology ?"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens = 150)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Question :What are fields in Information Technology ?

Answer :

Fields in Information Technology are the areas of study that deal with the design, development, and implementation of computer systems. They include topics such as computer architecture, software engineering, database management, and networking.

Computer architecture is the study of how computers are designed and built, including topics such as memory, processors, and input/output devices. Software engineering is the study of how to design, develop, and test computer programs, including topics such as algorithms, data structures, and debugging. Database management is the study of how to design, implement, and manage databases, including topics such as relational databases, object-relational databases, and distributed databases. Networking is the study of how to design, implement, and manage computer networks,


In [10]:
text = "Question : What are fields in Information Technology ?"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=150)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Question : What are fields in Information Technology ?

Answer :

Fields in Information Technology are the areas of study or specialization in the field of Information Technology.

Fields in Information Technology are the areas of study or specialization in the field of Information Technology.

Fields in Information Technology are the areas of study or specialization in the field of Information Technology.

Fields in Information Technology are the areas of study or specialization in the field of Information Technology.

Fields in Information Technology are the areas of study or specialization in the field of Information Technology.

Fields in Information Technology are the areas of study or specialization in the field of Information Technology.

Fields in Information Technology are the areas of study or specialization in the field of Information Technology.

Fields in Information Technology are the areas of study or specialization in the


In [11]:
os.environ["WANDB_DISABLED"] = "false"

In [12]:
lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM",
)

In [13]:
from datasets import load_dataset
data = load_dataset('csv', data_files = "/kaggle/input/essentiall/Smart Intern Dataset .csv")

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [14]:
data = data.map(lambda samples: tokenizer(samples["Question"]), batched=True)

Map:   0%|          | 0/148 [00:00<?, ? examples/s]

In [15]:
data['train']['Question']

['"what are the skills and experience that I should have as Data Networking Intern?"',
 '"I am an engineering assistant skills  ability to make decisions and solve problems ability to work in a team structure attention to detail balanced computer proficient dependable determined educated fast learner flexible friendly  hardworking honest issue resolution interpersonal skills kind outgoing quick on my feet & with my hands reliable strong verbal communication time management very organized & energetic computer proficient  I want to gain a practical experiences to get a job in this field "',
 '"As an HR personal assistant with a strong background in administrative support, auditing, and various other skills, gaining further experience in your field can enhance your career prospects and professional development significantly. Here are several strategies to gain experience and elevate your career in HR and administrative support:"',
 '" I have education bachelor\'s of arts : political scien

In [16]:
def formatting_func(example):
    text = f"Question: {example['Question'][0]}\nAnswer: {example['Answer'][0]}"
    return [text]

In [17]:
data['train']

Dataset({
    features: ['Question', 'Answer', 'input_ids', 'attention_mask'],
    num_rows: 148
})

In [18]:
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)



Map:   0%|          | 0/148 [00:00<?, ? examples/s]



In [19]:
trainer.train()#c1b73ca72fb3109769dc38673f0fd2f05fd2e87c

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ·····························································


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 61
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
1,0.4505
2,0.4505
3,0.4427
4,0.4259
5,0.4066
6,0.3852
7,0.3624
8,0.338
9,0.3125
10,0.2869


TrainOutput(global_step=100, training_loss=0.05637011506391586, metrics={'train_runtime': 453.4761, 'train_samples_per_second': 1.764, 'train_steps_per_second': 0.221, 'total_flos': 914185820160000.0, 'train_loss': 0.05637011506391586, 'epoch': 100.0})

In [20]:
text = "Question : I'm Lina, and I've just finished my studies in marketing. Despite my degree, I feel unprepared for the digital aspects of marketing, which seem to dominate job listings. I need to gain practical experience in digital marketing strategies and tools. Does Smart Intern offer simulations that can help me build these skills and receive evaluations on my performance?"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens = 600)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Question : I'm Lina, and I've just finished my studies in marketing. Despite my degree, I feel unprepared for the digital aspects of marketing, which seem to dominate job listings. I need to gain practical experience in digital marketing strategies and tools. Does Smart Intern offer simulations that can help me build these skills and receive evaluations on my performance?

Answer : Smart Intern provides a unique opportunity to gain hands-on experience in digital marketing through interactive simulations that simulate the processes and strategies of digital marketing. By engaging with these simulations, you can learn essential skills such as keyword research, link building, social media marketing, email marketing, and advertising campaigns, without needing to invest time or money in actual projects. The simulations provide a hands-on experience that can prepare you well for actual projects and interviews by giving you a clear understanding of the strategies and tools that you should be 

In [21]:
import cloudpickle
cloudpickle.dump(model, open('model.pkl', 'wb'))

In [22]:
pickled_model = cloudpickle.load(open('/kaggle/working/model.pkl', 'rb'))

In [25]:
text = "Question : I'm Lina, and I've just finished my studies in marketing. Despite my degree, I feel unprepared for the digital aspects of marketing, which seem to dominate job listings. I need to gain practical experience in digital marketing strategies and tools. Does Smart Intern offer simulations that can help me build these skills and receive evaluations on my performance?"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = pickled_model.generate(**inputs, max_new_tokens = 500)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Question : I'm Lina, and I've just finished my studies in marketing. Despite my degree, I feel unprepared for the digital aspects of marketing, which seem to dominate job listings. I need to gain practical experience in digital marketing strategies and tools. Does Smart Intern offer simulations that can help me build these skills and receive evaluations on my performance?

Answer : Smart Intern provides a unique opportunity to gain hands-on experience in digital marketing through interactive simulations that simulate the processes and strategies of digital marketing. By engaging with these simulations, you can learn essential skills such as keyword research, link building, social media marketing, email marketing, and advertising campaigns, without needing to invest time or money in actual projects. The simulations provide a hands-on experience that can prepare you well for actual projects and interviews by giving you a clear understanding of the strategies and tools that you should be 

In [None]:
import zipfile
import os

# Specify the file you want to compress
file_to_compress = "/kaggle/working/model2.pkl"
# Specify the output ZIP file name
zip_filename = "/kaggle/working/model2.zip"

# Create a ZIP file and add your file
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(file_to_compress, arcname=os.path.basename(file_to_compress))

print(f"File compressed and saved as {zip_filename}")

In [None]:
import gzip
import shutil

# Specify the file you want to compress
file_to_compress = "/kaggle/working/model2.pkl"
# Specify the output GZ file name
gz_filename = "/kaggle/working/model2.pkl.gz"

# Compress the file
with open(file_to_compress, 'rb') as f_in:
    with gzip.open(gz_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print(f"File compressed and saved as {gz_filename}")

<a id="Deployment Model with Streamlit"></a>
# Deployment Model with Streamlit

In [None]:
!pip install streamlit

In [None]:
import streamlit as st

In [None]:
def SmartIntern_prediction(input_text) :
    device = "cuda:0"
    
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    
    outputs = pickled_model.generate(**inputs, max_new_tokens = 500)
    
    return(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
def main():
    
    st.title("Smart Intern")
    st.header("How can i help you !")
    input_text = st.text_input('text input')
    output = SmartIntern_prediction(input_text)
    st.success(output)
    

In [None]:
if __name__ == '__main__':
    
    main()

In [None]:
st run main